--- zzzz-none-000/linux-3.10.107/net/core/dev.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/core/dev.c 2021-02-04 17:41:59.000000000 +0000 @@ -97,8 +97,12 @@ #include #include #include +#ifdef CONFIG_AVM_PA +#include +#endif #include #include +#include #include #include #include @@ -118,6 +122,7 @@ #include #include #include +#include #include #include #include @@ -129,8 +134,15 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "net-sysfs.h" +#include "skbuff_debug.h" /* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 @@ -144,6 +156,14 @@ struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); + +static int (*avm_recvhook)(struct sk_buff *skb); +static int (*avm_early_recvhook)(struct sk_buff *skb); + /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -166,7 +186,13 @@ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -seqcount_t devnet_rename_seq; +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { @@ -335,6 +361,18 @@ *******************************************************************************/ +void set_avm_recvhook(int (*recvhook)(struct sk_buff *skb)) +{ + avm_recvhook = recvhook; +} +EXPORT_SYMBOL(set_avm_recvhook); + +void set_avm_early_recvhook(int (*recvhook)(struct sk_buff *skb)) +{ + avm_early_recvhook = recvhook; +} +EXPORT_SYMBOL(set_avm_early_recvhook); + /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be @@ -354,9 +392,10 @@ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &ptype_all; else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + return pt->dev ? &pt->dev->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -450,10 +489,14 @@ */ void dev_add_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct packet_offload *elem; spin_lock(&offload_lock); - list_add_rcu(&po->list, head); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); @@ -471,7 +514,7 @@ * and must not be freed until after all the CPU's have gone * through a quiescent state. */ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; @@ -643,6 +686,49 @@ *******************************************************************************/ /** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. 
+ */ + +int dev_get_iflink(const struct net_device *dev) +{ + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); + + return dev->ifindex; +} +EXPORT_SYMBOL(dev_get_iflink); + +/** + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. + * + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. + */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info; + + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; + + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; + + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + +/** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find @@ -883,23 +969,25 @@ EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. */ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -907,7 +995,7 @@ } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device @@ -1072,6 +1160,7 @@ */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1099,16 +1188,25 @@ return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1128,6 +1226,9 @@ err = ret; write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1194,8 +1295,12 @@ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + 
call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1232,9 +1337,7 @@ * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_poll_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1249,13 +1352,12 @@ if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - netpoll_rx_enable(dev); + netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1287,7 +1389,7 @@ if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1301,7 +1403,10 @@ ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1312,12 +1417,12 @@ * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1331,7 +1436,7 @@ ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); + netpoll_poll_enable(dev); } return 0; @@ -1342,39 +1447,34 @@ int retval; LIST_HEAD(single); - /* Temporarily disable netpoll until the interface is down */ - retval = netpoll_rx_disable(dev); - if (retval) - return retval; - - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); - netpoll_rx_enable(dev); return retval; } -static int dev_close_many(struct list_head *head) +int dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); + if (unlink) + list_del_init(&dev->close_list); } - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); return 0; } +EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. 
@@ -1387,22 +1487,14 @@ */ int dev_close(struct net_device *dev) { - int ret = 0; if (dev->flags & IFF_UP) { LIST_HEAD(single); - /* Block netpoll rx while the interface is going down */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; - - list_add(&dev->unreg_list, &single); - dev_close_many(&single); + list_add(&dev->close_list, &single); + dev_close_many(&single, true); list_del(&single); - - netpoll_rx_enable(dev); } - return ret; + return 0; } EXPORT_SYMBOL(dev_close); @@ -1417,21 +1509,28 @@ */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1464,7 +1563,7 @@ goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1472,7 +1571,7 @@ if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1488,10 +1587,11 @@ goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1529,10 +1629,11 @@ for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1542,6 +1643,25 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). 
+ */ + +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} + +/** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function @@ -1552,11 +1672,28 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); +#ifdef CONFIG_NET_INGRESS +static struct static_key ingress_needed __read_mostly; + +void net_inc_ingress_queue(void) +{ + static_key_slow_inc(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_key_slow_dec(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; @@ -1628,8 +1765,7 @@ __net_timestamp(SKB); \ } \ -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { unsigned int len; @@ -1648,6 +1784,25 @@ return false; } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_orphan_frags(skb, GFP_ATOMIC) || + unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->priority = 0; + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1669,32 +1824,7 @@ */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - skb_orphan(skb); - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - skb->skb_iif = 0; - skb->dev = dev; - skb_dst_drop(skb); - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); - nf_reset_trace(skb); - return netif_rx(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1708,6 +1838,24 @@ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *orig_dev, + __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (pt_prev) + deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1731,45 +1879,54 @@ struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; + struct 
list_head *ptype_list = &ptype_all; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; + + net_timestamp_set(skb2); + + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); + } - if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; - } + if (ptype_list == &ptype_all) { + ptype_list = &dev->ptype_all; + goto again; } +out_unlock: if (pt_prev) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); @@ -1912,7 +2069,8 @@ return new_map; } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; struct xps_map *map, *new_map; @@ -2073,7 +2231,7 @@ } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2124,7 +2282,7 @@ unsigned long flags; local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; @@ -2139,30 +2297,89 @@ } EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { + enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) +{ + return (struct dev_kfree_skb_cb *)skb->cb; +} + +void netif_schedule_queue(struct netdev_queue *txq) { - if (atomic_dec_and_test(&skb->users)) { - struct softnet_data *sd; - unsigned long flags; + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - skb->next = sd->completion_queue; - sd->completion_queue = skb; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + 
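
A minimal sketch (editorial, not part of the patch) of how a module might register a device-bound tap using the per-device ptype_all/ptype_specific lists that ptype_head() now selects and dev_queue_xmit_nit() walks above. The interface name "eth0" and the handler body are illustrative assumptions only.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>

static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The tap owns this reference; release it when done. */
	pr_debug("tap: %u bytes on %s\n", skb->len, dev->name);
	kfree_skb(skb);
	return 0;
}

static struct packet_type tap_pt = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* tap all protocols */
	.func	= tap_rcv,
};

static int __init tap_init(void)
{
	/* With ->dev set, the tap lands on that device's private list;
	 * left NULL it falls back to the global ptype_all as before. */
	tap_pt.dev = dev_get_by_name(&init_net, "eth0");
	dev_add_pack(&tap_pt);
	return 0;
}

static void __exit tap_exit(void)
{
	dev_remove_pack(&tap_pt);
	if (tap_pt.dev)
		dev_put(tap_pt.dev);
}

module_init(tap_init);
module_exit(tap_exit);
MODULE_LICENSE("GPL");
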
+/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. + */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); } } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(netif_tx_wake_queue); -void dev_kfree_skb_any(struct sk_buff *skb) +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ + unsigned long flags; + + if (likely(atomic_read(&skb->users) == 1)) { + smp_rmb(); + atomic_set(&skb->users, 0); + } else if (likely(!atomic_dec_and_test(&skb->users))) { + return; + } + get_kfree_skb_cb(skb)->reason = reason; + local_irq_save(flags); + skb->next = __this_cpu_read(softnet_data.completion_queue); + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__dev_kfree_skb_irq); + +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); + __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(__dev_kfree_skb_any); /** @@ -2196,21 +2413,52 @@ } EXPORT_SYMBOL(netif_device_attach); +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? 
&skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); @@ -2264,10 +2512,9 @@ } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { @@ -2280,18 +2527,7 @@ type = eth->h_proto; } - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } - - return type; + return __vlan_get_protocol(skb, type, depth); } /** @@ -2304,26 +2540,17 @@ { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); - __skb_pull(skb, skb->mac_len); + __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } @@ -2357,6 +2584,8 @@ * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2366,12 +2595,17 @@ skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + skb_reset_mac_header(skb); skb_reset_mac_len(skb); @@ -2396,7 +2630,7 @@ * 2. No high memory really exists on this machine. */ -static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2424,229 +2658,249 @@ return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. 
+ */ +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) { - struct dev_gso_cb *cb; - - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); + if (eth_p_mpls(type)) + features &= skb->dev->mpls_features; - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); + return features; } - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) { - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; + return features; } +#endif static netdev_features_t harmonize_features(struct sk_buff *skb, - __be16 protocol, - const struct net_device *dev, - netdev_features_t features) + netdev_features_t features) { + int tmp; + __be16 type; + + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); + if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, protocol)) { + !can_checksum_protocol(features, type)) { features &= ~NETIF_F_ALL_CSUM; } - if (illegal_highdma(dev, skb)) + if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev) +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return features; +} +EXPORT_SYMBOL(passthru_features_check); + +static netdev_features_t dflt_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return vlan_features_check(skb, features); +} + +netdev_features_t netif_skb_features(struct sk_buff *skb) { - __be16 protocol = skb->protocol; + struct net_device *dev = skb->dev; netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, protocol, dev, features); - } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; - features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_vlan_tagged(skb)) + features = netdev_intersect_features(features, + dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + else + 
features &= dflt_features_check(skb, dev, features); - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) { - return harmonize_features(skb, protocol, dev, features); - } else { - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, protocol, dev, features); + return harmonize_features(skb, features); +} +EXPORT_SYMBOL(netif_skb_features); + +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) +{ + unsigned int len; + int rc; + + /* If this skb has been fast forwarded then we don't want it to + * go to any taps (by definition we're trying to bypass them). + */ + if (unlikely(!skb->fast_forwarded)) { + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) + dev_queue_xmit_nit(skb, dev); } - return harmonize_features(skb, protocol, dev, features); -} -EXPORT_SYMBOL(netif_skb_dev_features); +#ifdef CONFIG_ETHERNET_PACKET_MANGLE + if (!dev->eth_mangle_tx || + (skb = dev->eth_mangle_tx(dev, skb)) != NULL) +#else + if (1) +#endif + { + len = skb->len; + trace_net_dev_start_xmit(skb, dev); +#ifdef CONFIG_NET_DEBUG_SKBUFF_LEAK + skb_track_funccall(skb, ops->ndo_start_xmit); +#endif + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); + } else { + rc = NETDEV_TX_OK; + } -/* - * Returns true if either: - * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, - netdev_features_t features) -{ - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && - !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && - !(features & NETIF_F_SG))); + return rc; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - const struct net_device_ops *ops = dev->netdev_ops; + struct sk_buff *skb = first; int rc = NETDEV_TX_OK; - unsigned int skb_len; - if (likely(!skb->next)) { - netdev_features_t features; - - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); +#ifdef CONFIG_AVM_PA + avm_pa_dev_snoop_transmit(AVM_PA_DEVINFO(dev), skb); +#endif - features = netif_skb_features(skb); + while (skb) { + struct sk_buff *next = skb->next; - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; + skb->next = NULL; + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } - skb->vlan_tci = 0; + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; } + } - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; +out: + *ret = rc; + return skb; +} - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; 
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } - } +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); + if (skb->next) + return skb; - skb_len = skb->len; - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; + + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; + } } -gso: - do { - struct sk_buff *nskb = skb->next; + return skb; - skb->next = nskb->next; - nskb->next = NULL; +out_kfree_skb: + kfree_skb(skb); +out_null: + return NULL; +} - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; - skb_len = nskb->len; - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; - } - txq_trans_update(txq); - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + for (; skb != NULL; skb = next) { + next = skb->next; + skb->next = NULL; + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. 
+ */ + tail = skb->prev; } -out_kfree_skb: - kfree_skb(skb); -out: - return rc; + return head; } +EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2691,8 +2945,8 @@ /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) @@ -2709,12 +2963,10 @@ * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2725,7 +2977,9 @@ rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); +#if defined(CONFIG_AVM_PA) && defined(AVM_PA_MARK_SHAPED) + avm_pa_mark_shaped(skb); +#endif rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2741,7 +2995,7 @@ return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); @@ -2757,14 +3011,18 @@ #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); @@ -2777,9 +3035,89 @@ } EXPORT_SYMBOL(dev_loopback_xmit); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[skb->sender_cpu - 1]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + sk_fullsock(sk) && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if 
(dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** - * dev_queue_xmit - transmit a buffer + * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit + * @accel_priv: private data used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -2802,15 +3140,22 @@ * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; +#ifdef CONFIG_NET_DEBUG_SKBUFF_LEAK + skb_track_caller(skb); +#endif + skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2818,7 +3163,25 @@ skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT @@ -2850,11 +3213,15 @@ if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2875,16 +3242,29 @@ } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); - kfree_skb(skb); + atomic_long_inc(&dev->tx_dropped); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } + +int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ + return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); + /*======================================================================= Receiver routines @@ -2910,6 +3290,8 @@ /* One global table that all flow-based protocols share. 
*/ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +u32 rps_cpu_mask __read_mostly; +EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; @@ -2917,7 +3299,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -2938,7 +3320,7 @@ flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -2966,15 +3348,17 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -2982,44 +3366,45 @@ dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; - tcpu = rflow->cpu; + /* First check into global flow table if there is a match */ + ident = sock_flow_table->ents[hash & sock_flow_table->mask]; + if ((ident ^ hash) & ~rps_cpu_mask) + goto try_rps; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + next_cpu = ident & rps_cpu_mask; + + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ + rflow = &flow_table->flows[hash & flow_table->mask]; + tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3027,23 +3412,24 @@ * have been dequeued, thus preserving in order delivery. 
*/ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } - if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +try_rps: + if (map) { + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3074,14 +3460,14 @@ struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) @@ -3113,7 +3499,7 @@ static int rps_ipi_queued(struct softnet_data *sd) { #ifdef CONFIG_RPS - struct softnet_data *mysd = &__get_cpu_var(softnet_data); + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; @@ -3126,6 +3512,46 @@ return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = this_cpu_ptr(&softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). @@ -3135,14 +3561,18 @@ { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (!netif_running(skb->dev)) + goto drop; + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); @@ -3161,6 +3591,7 @@ goto enqueue; } +drop: sd->dropped++; rps_unlock(sd); @@ -3171,28 +3602,13 @@ return NET_RX_DROP; } -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. 
- * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb) { int ret; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; +#ifdef CONFIG_NET_DEBUG_SKBUFF_LEAK + skb_track_caller(skb); +#endif net_timestamp_check(netdev_tstamp_prequeue, skb); @@ -3222,14 +3638,38 @@ } return ret; } + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + trace_netif_rx_entry(skb); + + return netif_rx_internal(skb); +} EXPORT_SYMBOL(netif_rx); int netif_rx_ni(struct sk_buff *skb) { int err; + trace_netif_rx_ni_entry(skb); + preempt_disable(); - err = netif_rx(skb); + err = netif_rx_internal(skb); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -3240,7 +3680,7 @@ static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; @@ -3255,7 +3695,10 @@ clist = clist->next; WARN_ON(atomic_read(&skb->users)); - trace_kfree_skb(skb, net_tx_action); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } @@ -3277,7 +3720,7 @@ root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3287,7 +3730,7 @@ &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -3304,68 +3747,55 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. 
- * - */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - int result = TC_ACT_OK; - struct Qdisc *q; - - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rxq->qdisc; - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - - if (!rxq || rxq->qdisc == &noop_qdisc) - goto out; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out here. + */ + if (!cl) + return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - switch (ing_filter(skb, rxq)) { + qdisc_skb_cb(skb)->pkt_len = skb->len; + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); case TC_ACT_STOLEN: + case TC_ACT_QUEUED: kfree_skb(skb); return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + skb_do_redirect(skb); + return NULL; + default: + break; } - -out: - skb->tc_verd = 0; +#endif /* CONFIG_NET_CLS_ACT */ return skb; } -#endif /** * netdev_is_rx_handler_busy - check if receive handler is registered @@ -3389,7 +3819,7 @@ * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * @@ -3436,6 +3866,9 @@ } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); +int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly; +EXPORT_SYMBOL_GPL(athrs_fast_nat_recv); + /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. 
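
A minimal sketch (editorial, not taken from the patch) of how a module might attach one of the AVM receive hooks installed via set_avm_early_recvhook() earlier in this patch. The ARP test is purely illustrative, and detaching by passing NULL is an assumption based on the non-NULL check in __netif_receive_skb_core().

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Prototype as added by this patch; no header for it is shown here. */
extern void set_avm_early_recvhook(int (*recvhook)(struct sk_buff *skb));

static int my_early_hook(struct sk_buff *skb)
{
	/* Purely illustrative: consume ARP frames. A nonzero return means
	 * the hook now owns the skb and __netif_receive_skb_core() reports
	 * NET_RX_SUCCESS without delivering it any further. */
	if (skb->protocol == htons(ETH_P_ARP)) {
		kfree_skb(skb);
		return 1;
	}
	return 0;	/* 0: hand the packet back to the normal path */
}

static int __init hook_init(void)
{
	set_avm_early_recvhook(my_early_hook);
	return 0;
}

static void __exit hook_exit(void)
{
	/* Assumption: a NULL hook detaches us; the core only calls the
	 * pointer when it is non-NULL. */
	set_avm_early_recvhook(NULL);
}

module_init(hook_init);
module_exit(hook_exit);
MODULE_LICENSE("GPL");
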
@@ -3443,35 +3876,47 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; } } +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ +#ifdef CONFIG_NETFILTER_INGRESS + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } +#endif /* CONFIG_NETFILTER_INGRESS */ + return 0; +} + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; + int (*fast_recv)(struct sk_buff *skb); net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3481,6 +3926,11 @@ pt_prev = NULL; +#if IS_ENABLED(CONFIG_AVM_NET_SKB_INPUT_DEV) + if (!skb->input_dev) + skb->input_dev = skb->dev; +#endif + another_round: skb->skb_iif = skb->dev->ifindex; @@ -3488,11 +3938,19 @@ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } + fast_recv = rcu_dereference(athrs_fast_nat_recv); + if (fast_recv) { + if (fast_recv(skb)) { + ret = NET_RX_SUCCESS; + goto out; + } + } + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -3504,25 +3962,54 @@ goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: +#ifdef CONFIG_NET_INGRESS + if (static_key_false(&ingress_needed)) { + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; + + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif #ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; + skb->tc_verd = 0; ncls: #endif - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; - if (vlan_tx_tag_present(skb)) { +#ifdef CONFIG_AVM_PA +#ifdef CONFIG_AVM_NET_DEBUG_SKBUFF_LEAK + skb_track_funccall(skb, avm_pa_dev_receive); +#endif + if (avm_pa_dev_receive(AVM_PA_DEVINFO(skb->dev), skb) == 0) { + ret = NET_RX_SUCCESS; + goto out; + } +#endif + + if (avm_early_recvhook && (*avm_early_recvhook)(skb)) { + /* + * paket consumed by hook + */ + ret = NET_RX_SUCCESS; + goto out; + } + + if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; @@ -3554,8 +4041,8 @@ } } - if (unlikely(vlan_tx_tag_present(skb))) { - if (vlan_tx_tag_get_id(skb)) + if 
(unlikely(skb_vlan_tag_present(skb))) { + if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() @@ -3564,19 +4051,29 @@ skb->vlan_tci = 0; } - /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (skb && avm_recvhook && (*avm_recvhook)(skb)) { + /* + * paket consumed by hook + */ + ret = NET_RX_SUCCESS; + goto out; + } type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + + /* deliver only exact match when indicated */ + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + } + + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { @@ -3623,22 +4120,7 @@ return ret; } -/** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. - * - * This function may only be called from softirq context and interrupts - * should be enabled. - * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -3665,6 +4147,28 @@ rcu_read_unlock(); return ret; } + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + trace_netif_receive_skb_entry(skb); + + return netif_receive_skb_internal(skb); +} EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending @@ -3673,7 +4177,7 @@ static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct sk_buff *skb, *tmp; rps_lock(sd); @@ -3714,7 +4218,7 @@ if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb); + err = ptype->callbacks.gro_complete(skb, 0); break; } rcu_read_unlock(); @@ -3726,7 +4230,7 @@ } out: - return netif_receive_skb(skb); + return netif_receive_skb_internal(skb); } /* napi->gro_list contains packets ordered by age. 
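The per-device ptype_specific delivery above is what a device-bound protocol tap lands on; the following is a hedged sketch (my_tap_rcv() and my_tap_attach() are made-up names) of registering such a tap with dev_add_pack().

#include <linux/netdevice.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>

static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* inspect or count the frame, then free it */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = my_tap_rcv,
	/* .dev == NULL would mean "all devices" (ptype_base/ptype_all) */
};

static void my_tap_attach(struct net_device *dev)
{
	my_tap.dev = dev;	/* delivered via dev->ptype_specific */
	dev_add_pack(&my_tap);
}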
@@ -3762,21 +4266,69 @@ { struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_metadata_dst_cmp(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), + skb_mac_header(skb), maclen); NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; + } +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + pinfo->frags[0].page_offset += grow; + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } @@ -3788,11 +4340,15 @@ struct list_head *head = &offload_base; int same_flow; enum gro_result ret; + int grow; + + if (skb->gro_skip) + goto normal; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); @@ -3807,6 +4363,25 @@ NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -3831,39 +4406,35 @@ if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) + if (NAPI_GRO_CB(skb)->flush) goto normal; - napi->gro_count++; + if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { + struct sk_buff *nskb = napi->gro_list; + + /* locate the end of the list to select the 'oldest' flow */ + while (nskb->next) { + pp = &nskb->next; + nskb = *pp; + } + *pp = NULL; + nskb->next = NULL; + napi_gro_complete(nskb); + } else { + napi->gro_count++; + } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + 
NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->tail += grow; - skb->data_len -= grow; - - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); - } - } - + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); ok: return ret; @@ -3872,12 +4443,39 @@ goto pull; } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb(skb)) + if (netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -3886,10 +4484,13 @@ break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { + skb_dst_drop(skb); kmem_cache_free(skbuff_head_cache, skb); - else + skbuff_debugobj_deactivate(skb); + } else { __kfree_skb(skb); + } break; case GRO_HELD: @@ -3900,27 +4501,10 @@ return ret; } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (skb->mac_header == skb->tail && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, - skb_frag_size(frag0), - skb->end - skb->tail); - } -} - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + trace_napi_gro_receive_entry(skb); + skb_gro_reset_offset(skb); return napi_skb_finish(dev_gro_receive(napi, skb), skb); @@ -3929,12 +4513,18 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; @@ -3945,25 +4535,23 @@ struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); 
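A hedged sketch of the driver side of dev_gro_receive(): a poll routine feeding frames through napi_gro_receive(). my_poll(), my_ring_next_skb() and the budget handling are illustrative, not part of this patch.

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

static struct sk_buff *my_ring_next_skb(struct net_device *dev);	/* hypothetical */

static int my_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_ring_next_skb(napi->dev))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* may merge into napi->gro_list */
		work++;
	}

	if (work < budget)
		napi_complete(napi);		/* GRO flush/ageing happens here */
	return work;
}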
- if (skb) - napi->skb = skb; + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + napi->skb = skb; } return skb; } EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: + __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); - - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) + if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -3979,39 +4567,42 @@ return ret; } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); + eth = skb_gro_header_fast(skb, 0); + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + return NULL; } + } else { + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; } - - skb_gro_pull(skb, sizeof(*eth)); + __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. + * special handling. + * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; -out: return skb; } @@ -4022,12 +4613,39 @@ if (!skb) return GRO_DROP; + trace_napi_gro_frags_entry(skb); + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. 
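A hedged sketch of how a page-based driver would use napi_get_frags()/napi_gro_frags() shown above; the single-page fill and the truesize accounting are assumptions, and my_rx_page() is a made-up name.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void my_rx_page(struct napi_struct *napi, struct page *page,
		       unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;				/* caller recycles the page */

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;		/* assuming one full page per frame */

	/* napi_frags_skb() pulls the Ethernet header out of frag0 */
	napi_gro_frags(napi);
}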
*/ static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -4045,8 +4663,8 @@ struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); + smp_call_function_single_async(remsd->cpu, + &remsd->csd); remsd = next; } } else @@ -4054,25 +4672,32 @@ local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); @@ -4088,24 +4713,23 @@ } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); @@ -4117,30 +4741,42 @@ * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
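The comment above ends by pointing at __napi_schedule_irqoff(); a minimal, illustrative interrupt handler using the scheduling helpers would look roughly like this (struct my_priv and my_mask_rx_irq() are hypothetical).

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct my_priv {
	struct napi_struct napi;
};

static void my_mask_rx_irq(struct my_priv *priv);	/* hypothetical register write */

static irqreturn_t my_isr(int irq, void *data)
{
	struct my_priv *priv = data;

	my_mask_rx_irq(priv);		/* stop further RX interrupts */
	napi_schedule(&priv->napi);	/* or __napi_schedule_irqoff() when hard
					 * irqs are known to be masked here */
	return IRQ_HANDLED;
}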
*/ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); - ____napi_schedule(&__get_cpu_var(softnet_data), n); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); - list_del(&n->poll_list); - smp_mb__before_clear_bit(); + list_del_init(&n->poll_list); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4151,17 +4787,98 @@ if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + if (n->gro_list) { + unsigned long timeout = 0; + + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(napi_complete_done); + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; } -EXPORT_SYMBOL(napi_complete); void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, 
HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4180,168 +4897,173 @@ } EXPORT_SYMBOL(netif_napi_add); -void netif_napi_del(struct napi_struct *napi) +void napi_disable(struct napi_struct *n) { - struct sk_buff *skb, *next; + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + +void netif_napi_del(struct napi_struct *napi) +{ list_del_init(&napi->dev_list); napi_free_frags(napi); - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); - } - + kfree_skb_list(napi->gro_list); napi->gro_list = NULL; napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); -static void net_rx_action(struct softirq_action *h) +struct napi_struct *get_current_napi_context(void) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + + return sd->current_napi; +} +EXPORT_SYMBOL(get_current_napi_context); + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; void *have; + int work, weight; - local_irq_disable(); + list_del_init(&n->poll_list); - while (!list_empty(&sd->poll_list)) { - struct napi_struct *n; - int work, weight; + have = netpoll_poll_lock(n); - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + weight = n->weight; - local_irq_enable(); + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + WARN_ON_ONCE(work > weight); - have = netpoll_poll_lock(n); + if (likely(work < weight)) + goto out_unlock; - weight = n->weight; + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + goto out_unlock; + } - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. 
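A hedged sketch of the NAPI lifecycle around netif_napi_add()/napi_disable()/netif_napi_del() above, as a driver's open/stop paths would use them; my_open(), my_stop() and struct my_priv are illustrative.

#include <linux/netdevice.h>

struct my_priv {
	struct net_device *dev;
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget);	/* as sketched earlier */

static void my_open(struct my_priv *priv)
{
	netif_napi_add(priv->dev, &priv->napi, my_poll, 64);
	napi_enable(&priv->napi);
}

static void my_stop(struct my_priv *priv)
{
	napi_disable(&priv->napi);	/* sleeps until the poll owner lets go */
	netif_napi_del(&priv->napi);	/* also frees any pending gro_list */
}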
*/ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); - } + napi_gro_flush(n, HZ >= 1000); + } - WARN_ON_ONCE(work > weight); + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + goto out_unlock; + } - budget -= work; + list_add_tail(&n->poll_list, repoll); - local_irq_disable(); +out_unlock: + netpoll_poll_unlock(have); - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); - napi_complete(n); - local_irq_disable(); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - local_irq_enable(); - napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); - } - list_move_tail(&n->poll_list, &sd->poll_list); - } + return work; +} + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); + + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); + + for (;;) { + struct napi_struct *n; + + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + return; + break; } - netpoll_poll_unlock(have); + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. 
+ */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } } -out: - net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif + local_irq_disable(); - return; + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); -softnet_break: - sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; + net_rps_action_and_irq_enable(sd); } -struct netdev_upper { +struct netdev_adjacent { struct net_device *dev; - bool master; - struct list_head list; - struct rcu_head rcu; - struct list_head search_list; -}; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_upper *upper; + /* upper master flag, there can only be one master device per list */ + bool master; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); - } -} + /* counter for the number of times this device was added to us */ + u16 ref_nr; -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) -{ - LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; - bool ret = false; + /* private field for the users */ + void *private; - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; - break; - } - __append_search_uppers(&search_list, upper->dev); - } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; -} + struct list_head list; + struct rcu_head rcu; +}; -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, + struct list_head *adj_list) { - struct netdev_upper *upper; + struct netdev_adjacent *adj; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } @@ -4360,7 +5082,7 @@ { ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4371,13 +5093,12 @@ * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. 
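A short, illustrative use of the adjacency queries above, as a bonding-style driver might run them before enslaving a device; my_can_enslave() is a made-up helper.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/errno.h>

static int my_can_enslave(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();

	if (netdev_master_upper_dev_get(slave))
		return -EBUSY;		/* slave already has a master */

	if (netdev_has_upper_dev(slave, master))
		return -EEXIST;		/* already linked to this master */

	return 0;
}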
*/ -bool netdev_has_any_upper_dev(struct net_device *dev) +static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->all_adj_list.upper); } -EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device @@ -4388,21 +5109,185 @@ */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->all_adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. 
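The iterators above are normally consumed through the netdev_for_each_upper_dev_rcu() macro (this patch itself uses it further down for the address-mask helpers); a hedged example, with my_has_bridge_above() being an invented helper:

#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static bool my_has_bridge_above(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;
	bool found = false;

	rcu_read_lock();
	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		if (upper->priv_flags & IFF_EBRIDGE) {	/* a bridge is stacked on us */
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}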
+ */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device @@ -4412,20 +5297,271 @@ */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? 
+ "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + u16 ref_nr, + struct list_head *dev_list, + void *private, bool master) +{ + struct netdev_adjacent *adj; + int ret; + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (adj) { + adj->ref_nr += ref_nr; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = ref_nr; + adj->private = private; + dev_hold(adj_dev); + + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. */ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } + + return 0; + +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + kfree(adj); + dev_put(adj_dev); + + return ret; +} + +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + u16 ref_nr, + struct list_head *dev_list) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); + BUG(); + } + + if (adj->ref_nr > ref_nr) { + pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, + ref_nr, adj->ref_nr-ref_nr); + adj->ref_nr -= ref_nr; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) +{ + int ret; + + ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, + private, master); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, + private, false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + return ret; + } + + return 0; +} + +static int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr) +{ + return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + NULL, false); +} + +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list) +{ + __netdev_adjacent_dev_remove(dev, 
upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); +} + +static void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr) +{ + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); +} + +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); + + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); + return ret; + } + + return 0; +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} + +static void __netdev_addr_mask(unsigned char *mask, const unsigned char *addr, + struct net_device *dev) +{ + int i; + + for (i = 0; i < dev->addr_len; i++) + mask[i] |= addr[i] ^ dev->dev_addr[i]; +} + +static void __netdev_upper_mask(unsigned char *mask, struct net_device *dev, + struct net_device *lower) +{ + struct net_device *cur; + struct list_head *iter; + + netdev_for_each_upper_dev_rcu(dev, cur, iter) { + __netdev_addr_mask(mask, cur->dev_addr, lower); + __netdev_upper_mask(mask, cur, lower); + } +} + +static void __netdev_update_addr_mask(struct net_device *dev) +{ + unsigned char mask[MAX_ADDR_LEN]; + struct net_device *cur; + struct list_head *iter; + + memset(mask, 0, sizeof(mask)); + __netdev_upper_mask(mask, dev, dev); + memcpy(dev->local_addr_mask, mask, dev->addr_len); + + netdev_for_each_lower_dev(dev, cur, iter) + __netdev_update_addr_mask(cur); +} + +static void netdev_update_addr_mask(struct net_device *dev) +{ + rcu_read_lock(); + __netdev_update_addr_mask(dev); + rcu_read_unlock(); +} + static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) + struct net_device *upper_dev, bool master, + void *private) { - struct netdev_upper *upper; + struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4433,31 +5569,104 @@ return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) + if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); - - /* Ensure that master upper link is always the first item in list. 
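For reference, the public entry points built on this adjacency machinery are exercised roughly as below by stacking drivers; my_enslave()/my_release() are illustrative names and error handling is reduced to the minimum.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int my_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	/* ... program hardware, sync MTU/addresses, etc. ... */
	return 0;
}

static void my_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}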
*/ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); + if (ret) + return ret; + + /* Now that we linked these devs, make all the upper_dev's + * all_adj_list.upper visible to every dev's all_adj_list.lower an + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); + ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); + if (ret) + goto rollback_mesh; + } + } + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); + ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); + ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); + if (ret) + goto rollback_lower_mesh; + } + + netdev_update_addr_mask(dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + return ret; } /** @@ -4473,7 +5682,7 @@ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4491,10 +5700,18 @@ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, true); + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4506,19 +5723,176 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; - + struct netdev_notifier_changeupper_info 
changeupper_info; + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); + + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); + + netdev_update_addr_mask(dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + +static void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +static void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + 
netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -4527,7 +5901,7 @@ ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; @@ -4570,6 +5944,8 @@ dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -4589,7 +5965,7 @@ unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) @@ -4598,22 +5974,9 @@ } EXPORT_SYMBOL(dev_set_promiscuity); -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); @@ -4636,9 +5999,30 @@ if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } return 0; } + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. 
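A minimal sketch of how the reference-counted flags documented above are used by a capture-style consumer; my_start_capture()/my_stop_capture() are invented, and both calls must run under RTNL as noted.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void my_start_capture(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, 1);	/* see every unicast frame */
	dev_set_allmulti(dev, 1);	/* and every multicast group */
	rtnl_unlock();
}

static void my_stop_capture(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* drop our references again */
	dev_set_allmulti(dev, -1);
	rtnl_unlock();
}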
+ */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} EXPORT_SYMBOL(dev_set_allmulti); /* @@ -4663,10 +6047,10 @@ * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -4674,7 +6058,6 @@ if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } -EXPORT_SYMBOL(__dev_set_rx_mode); void dev_set_rx_mode(struct net_device *dev) { @@ -4747,18 +6130,18 @@ */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -4769,16 +6152,20 @@ int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + __dev_set_allmulti(dev, inc, false); } return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -4787,8 +6174,13 @@ } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } } /** @@ -4802,21 +6194,29 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + dev->mtu = new_mtu; + return 0; +} + /** * dev_set_mtu - Change maximum transfer unit * @dev: device @@ -4826,8 +6226,7 @@ */ int dev_set_mtu(struct net_device *dev, int new_mtu) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; + int err, orig_mtu; if (new_mtu == dev->mtu) return 0; @@ -4839,14 +6238,25 @@ if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; + err = 
call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + orig_mtu = dev->mtu; + err = __dev_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __dev_set_mtu(dev, orig_mtu); + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + } + } return err; } EXPORT_SYMBOL(dev_set_mtu); @@ -4884,6 +6294,7 @@ if (err) return err; dev->addr_assign_type = NET_ADDR_SET; + netdev_update_addr_mask(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; @@ -4910,6 +6321,62 @@ EXPORT_SYMBOL(dev_change_carrier); /** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); + +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + +/** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -4930,15 +6397,18 @@ /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) { struct net_device *dev, *tmp; + LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -4961,18 +6431,23 @@ } /* If device is running, close it first. */ - dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; + on_each_cpu(flush_backlog, dev, 1); } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. 
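The NETDEV_PRECHANGEMTU hook introduced in dev_set_mtu() above lets a notifier veto the change before anything is touched; a hedged sketch of such a notifier follows (my_mtu_notifier and the 1500-byte policy are purely illustrative).

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/errno.h>

static int my_mtu_notifier(struct notifier_block *nb, unsigned long event,
			   void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	/* dev->mtu still holds the old value at PRECHANGEMTU time */
	if (event == NETDEV_PRECHANGEMTU && dev->mtu > 1500)
		return notifier_from_errno(-EINVAL);	/* dev_set_mtu() fails with -EINVAL */

	return NOTIFY_DONE;
}

static struct notifier_block my_mtu_nb = {
	.notifier_call = my_mtu_notifier,
};
/* registered from module init with register_netdevice_notifier(&my_mtu_nb) */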
*/
 		dev_shutdown(dev);


@@ -4984,7 +6459,8 @@
 
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
-			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+						     GFP_KERNEL);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -4995,6 +6471,9 @@
 		if (dev->netdev_ops->ndo_uninit)
 			dev->netdev_ops->ndo_uninit(dev);
 
+		if (skb)
+			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
 		/* Notifier chain MUST detach us all upper devices. */
 		WARN_ON(netdev_has_any_upper_dev(dev));
 
@@ -5021,6 +6500,48 @@
 	list_del(&single);
 }
 
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+	struct net_device *upper, netdev_features_t features)
+{
+	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+	netdev_features_t feature;
+	int feature_bit;
+
+	for_each_netdev_feature(&upper_disables, feature_bit) {
+		feature = __NETIF_F_BIT(feature_bit);
+		if (!(upper->wanted_features & feature)
+		    && (features & feature)) {
+			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+				   &feature, upper->name);
+			features &= ~feature;
+		}
+	}
+
+	return features;
+}
+
+static void netdev_sync_lower_features(struct net_device *upper,
+	struct net_device *lower, netdev_features_t features)
+{
+	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+	netdev_features_t feature;
+	int feature_bit;
+
+	for_each_netdev_feature(&upper_disables, feature_bit) {
+		feature = __NETIF_F_BIT(feature_bit);
+		if (!(features & feature) && (lower->features & feature)) {
+			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+				   &feature, lower->name);
+			lower->wanted_features &= ~feature;
+			netdev_update_features(lower);
+
+			if (unlikely(lower->features & feature))
+				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+					    &feature, lower->name);
+		}
+	}
+}
+
 static netdev_features_t netdev_fix_features(struct net_device *dev,
 	netdev_features_t features)
 {
@@ -5078,13 +6599,22 @@
 		}
 	}
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (dev->netdev_ops->ndo_busy_poll)
+		features |= NETIF_F_BUSY_POLL;
+	else
+#endif
+		features &= ~NETIF_F_BUSY_POLL;
+
 	return features;
 }
 
 int __netdev_update_features(struct net_device *dev)
 {
+	struct net_device *upper, *lower;
 	netdev_features_t features;
-	int err = 0;
+	struct list_head *iter;
+	int err = -1;
 
 	ASSERT_RTNL();
 
@@ -5096,26 +6626,42 @@
 	/* driver might be less strict about feature dependencies */
 	features = netdev_fix_features(dev, features);
 
+	/* some features can't be enabled if they're off on an upper device */
+	netdev_for_each_upper_dev_rcu(dev, upper, iter)
+		features = netdev_sync_upper_features(dev, upper, features);
+
 	if (dev->features == features)
-		return 0;
+		goto sync_lower;
 
 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 		&dev->features, &features);
 
 	if (dev->netdev_ops->ndo_set_features)
 		err = dev->netdev_ops->ndo_set_features(dev, features);
+	else
+		err = 0;
 
 	if (unlikely(err < 0)) {
 		netdev_err(dev,
 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 			err, &features, &dev->features);
+		/* return non-0 since some features might have changed and
+		 * it's better to fire a spurious notification than miss it
+		 */
 		return -1;
 	}
 
+sync_lower:
+	/* some features must be disabled on lower devices when disabled
+	 * on an upper device (think: bonding master or bridge)
+	 */
+	netdev_for_each_lower_dev(dev, lower, iter)
+		netdev_sync_lower_features(dev, lower, features);
+
 	if (!err)
 		dev->features = features;
 
-	return 1;
+	return err < 0 ?
0 : 1; } /** @@ -5177,18 +6723,21 @@ } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; + size_t sz = count * sizeof(*rx); BUG_ON(count < 1); - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) - return -ENOMEM; - + rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!rx) { + rx = vzalloc(sz); + if (!rx) + return -ENOMEM; + } dev->_rx = rx; for (i = 0; i < count; i++) @@ -5211,17 +6760,26 @@ #endif } +static void netif_free_tx_queues(struct net_device *dev) +{ + kvfree(dev->_tx); +} + static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); - BUG_ON(count < 1); - - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) - return -ENOMEM; + if (count < 1 || count > 0xffff) + return -EINVAL; + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!tx) { + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; + } dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5230,6 +6788,17 @@ return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + /** * register_netdevice - register a network device * @dev: device to register @@ -5264,8 +6833,6 @@ spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - dev->iflink = -1; - ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -5295,9 +6862,6 @@ else if (__dev_get_by_index(net, dev->ifindex)) goto err_uninit; - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ @@ -5305,13 +6869,8 @@ dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - /* Turn on no cache copy if HW is doing checksum */ if (!(dev->flags & IFF_LOOPBACK)) { dev->hw_features |= NETIF_F_NOCACHE_COPY; - if (dev->features & NETIF_F_ALL_CSUM) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; - } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -5322,6 +6881,10 @@ */ dev->hw_enc_features |= NETIF_F_SG; + /* Make NETIF_F_SG inheritable to MPLS. 
+ */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5368,7 +6931,7 @@ */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; @@ -5573,19 +7136,31 @@ dev->reg_state = NETREG_UNREGISTERED; - on_each_cpu(flush_backlog, dev, 1); - netdev_wait_allrefs(dev); /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); - +#ifdef CONFIG_AVM_PA +#ifdef AVM_PA_DEV_UNREGISTER_SYNC + avm_pa_dev_unregister_sync(AVM_PA_DEVINFO(dev)); +#else + avm_pa_dev_unregister(AVM_PA_DEVINFO(dev)); +#endif +#endif if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -5637,6 +7212,7 @@ netdev_stats_to_stats64(storage, &dev->stats); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += atomic_long_read(&dev->tx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -5652,7 +7228,7 @@ if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - queue->qdisc = &noop_qdisc; + RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); #endif @@ -5669,19 +7245,28 @@ } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +void netdev_freemem(struct net_device *dev) +{ + char *addr = (char *)dev - dev->padded; + + kvfree(addr); +} + /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs + * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -5696,7 +7281,7 @@ return NULL; } -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; @@ -5712,7 +7297,9 @@ /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!p) + p = vzalloc(alloc_size); if (!p) return NULL; @@ -5721,7 +7308,7 @@ dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; if (dev_addr_init(dev)) goto free_pcpu; @@ -5733,20 +7320,35 @@ dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; +#ifdef CONFIG_AVM_PA + avm_pa_dev_init(AVM_PA_DEVINFO(dev)); +#endif setup(dev); + if (!dev->tx_queue_len) { + dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) @@ -5754,9 +7356,13 @@ #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: @@ -5765,13 +7371,8 @@ free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); -#ifdef CONFIG_RPS - kfree(dev->_rx); -#endif - -free_p: - kfree(p); +free_dev: + netdev_freemem(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); @@ -5788,11 +7389,9 @@ { struct napi_struct *p, *n; - release_net(dev_net(dev)); - - kfree(dev->_tx); -#ifdef CONFIG_RPS - kfree(dev->_rx); + netif_free_tx_queues(dev); +#ifdef CONFIG_SYSFS + kvfree(dev->_rx); #endif kfree(rcu_dereference_protected(dev->ingress_queue, 1)); @@ -5808,7 +7407,7 @@ /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + netdev_freemem(dev); return; } @@ -5974,7 +7573,7 @@ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -5984,20 +7583,18 @@ /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); + if (__dev_get_by_index(net, dev->ifindex)) dev->ifindex = dev_new_index(net); - if (iflink) - 
dev->iflink = dev->ifindex; - } /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); @@ -6013,7 +7610,7 @@ * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; @@ -6075,11 +7672,11 @@ /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx(skb); + netif_rx_ni(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx(skb); + netif_rx_ni(skb); input_queue_head_incr(oldsd); } @@ -6115,7 +7712,7 @@ } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; @@ -6172,51 +7769,45 @@ return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -6225,11 +7816,9 @@ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); @@ -6284,6 +7873,34 @@ rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. 
+	 */
+	struct net *net;
+	bool unregistering;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(&netdev_unregistering_wq, &wait);
+	for (;;) {
+		unregistering = false;
+		rtnl_lock();
+		list_for_each_entry(net, net_list, exit_list) {
+			if (net->dev_unreg_count > 0) {
+				unregistering = true;
+				break;
+			}
+		}
+		if (!unregistering)
+			break;
+		__rtnl_unlock();
+
+		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+	}
+	remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
 {
 	/* At exit all network devices most be removed from a network
@@ -6295,10 +7912,21 @@
 	struct net *net;
 	LIST_HEAD(dev_kill_list);
 
-	rtnl_lock();
+	/* To prevent network device cleanup code from dereferencing
+	 * loopback devices or network devices that have been freed
+	 * wait here for all pending unregistrations to complete,
+	 * before unregistering the loopback device and allowing the
+	 * network namespace to be freed.
+	 *
+	 * The netdev todo list containing all network devices
+	 * unregistrations that happen in default_device_exit_batch
+	 * will run in the rtnl_unlock() at the end of
+	 * default_device_exit_batch.
+	 */
+	rtnl_lock_unregistering(net_list);
 	list_for_each_entry(net, net_list, exit_list) {
 		for_each_netdev_reverse(net, dev) {
-			if (dev->rtnl_link_ops)
+			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
 			else
 				unregister_netdevice_queue(dev, &dev_kill_list);
@@ -6352,24 +7980,18 @@
 	for_each_possible_cpu(i) {
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
-		memset(sd, 0, sizeof(*sd));
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
-		sd->completion_queue = NULL;
 		INIT_LIST_HEAD(&sd->poll_list);
-		sd->output_queue = NULL;
 		sd->output_queue_tailp = &sd->output_queue;
 #ifdef CONFIG_RPS
 		sd->csd.func = rps_trigger_softirq;
 		sd->csd.info = sd;
-		sd->csd.flags = 0;
 		sd->cpu = i;
 #endif
 
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
-		sd->backlog.gro_list = NULL;
-		sd->backlog.gro_count = 0;
 	}
 
 	dev_boot_phase = 0;
@@ -6393,7 +8015,7 @@
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
 	hotcpu_notifier(dev_cpu_callback, 0);
-	dst_init();
+	dst_subsys_init();
 	rc = 0;
 out:
 	return rc;