/*
 * Copyright (c) 2019 AVM GmbH.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include "offdp.h"

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/hashtable.h>
#include <linux/async.h>
#include <linux/tracepoint.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>

#if IS_ENABLED(CONFIG_AVM_PA)
#include <linux/avm_pa.h>
#endif

/*
 * Virtual Endpoints
 *
 * [ASCII art: a skyline of Arp/TCP/UDP endpoints sitting on top of the CPU,
 *  next to a mountain trail signposted "Netdev => 5ns"]
 *
 * "What, 5ns?! Let's try that shady path over there, it looks like a shortcut!"
 */

struct offdp_stats {
	unsigned int packets;
	unsigned int bytes;
	unsigned int dropped;
};

struct offdp_vep {
	struct hlist_node dev2vep_node;
	struct hlist_node vep2dev_node;
	struct rcu_head rcu;
	unsigned long handle;
	struct net_device *dev;
	struct kobject kobj;
	struct offdp_stats __percpu *rx_stats;
	struct offdp_stats __percpu *rx_slow_stats;
	struct offdp_stats __percpu *tx_stats;
};

static struct kobj_type offdp_vep_kobj_type = {
	.sysfs_ops = &kobj_sysfs_ops,
};

static struct kobject *virtualep_kobj;
static struct kset *veps_kset;
static struct ktd_suite *test_suite;

/* Give exactly one page to the virtual endpoint entries */
#define VEPS_PER_PAGE (PAGE_SIZE / sizeof(struct offdp_vep))
struct offdp_vep veps[VEPS_PER_PAGE] __page_aligned_bss;
static HANDLE_POOL_DECLARE(vep_idx_pool, VEPS_PER_PAGE);

#define LOOKUP_HASH_BITS 6
static DEFINE_READ_MOSTLY_HASHTABLE(dev2vep, LOOKUP_HASH_BITS);
static DEFINE_READ_MOSTLY_HASHTABLE(vep2dev, LOOKUP_HASH_BITS);

/* Protect against concurrent registrations. */
DEFINE_SPINLOCK(vep_writer_lock);
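
/*
 * A registered net_device and its backend handle share one veps[] slot that
 * is reachable through two RCU hash tables: dev2vep (keyed by ifindex) and
 * vep2dev (keyed by the backend handle).  Writers serialize on
 * vep_writer_lock; readers only need rcu_read_lock().  A minimal reader
 * sketch (hypothetical caller, mirroring the lookup helpers below):
 *
 *	rcu_read_lock();
 *	if (_vep_from_netdev(dev, &vep) == OFFDP_SUCCESS)
 *		handle = vep->handle;
 *	rcu_read_unlock();
 */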
static enum offdp_rv _vep_from_netdev(const struct net_device *dev,
				      struct offdp_vep **vep)
{
	struct offdp_vep *cur;

	/* Don't hand out NULL; point at a valid slot even on lookup failure. */
	*vep = &veps[0];
	hash_for_each_possible_rcu (dev2vep, cur, dev2vep_node, dev->ifindex) {
		if (cur->dev != dev)
			continue;
		*vep = cur;
		return OFFDP_SUCCESS;
	}

	return OFFDP_ERR_DEV_RESOLVE;
}

static enum offdp_rv vep_get_from_netdev(const struct net_device *dev,
					 struct offdp_vep **vep)
{
	enum offdp_rv rv;

	rv = _vep_from_netdev(dev, vep);
	if (rv == OFFDP_SUCCESS)
		kobject_get(&(*vep)->kobj);

	return rv;
}

static enum offdp_rv _vep_from_handle(unsigned long handle,
				      struct offdp_vep **vep)
{
	struct offdp_vep *cur;

	/* Don't hand out NULL; point at a valid slot even on lookup failure. */
	*vep = &veps[0];
	hash_for_each_possible_rcu (vep2dev, cur, vep2dev_node, handle) {
		if (cur->handle != handle)
			continue;
		*vep = cur;
		return OFFDP_SUCCESS;
	}

	return OFFDP_ERR_DEV_RESOLVE;
}

static void vep_put(struct offdp_vep *vep)
{
	if (vep)
		kobject_put(&vep->kobj);
}

static unsigned int all_cpu_sum(unsigned int __percpu *count)
{
	int cpu;
	unsigned int sum;

	sum = 0;
	for_each_online_cpu (cpu) {
		sum += *per_cpu_ptr(count, cpu);
	}

	return sum;
}

static ssize_t stats_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	struct offdp_vep *vep = container_of(kobj, struct offdp_vep, kobj);
	struct offdp_stats __percpu *stats;

	if (!strcmp(attr->attr.name, "stats_rx"))
		stats = vep->rx_stats;
	else if (!strcmp(attr->attr.name, "stats_rx_slow"))
		stats = vep->rx_slow_stats;
	else if (!strcmp(attr->attr.name, "stats_tx"))
		stats = vep->tx_stats;
	else
		return -EACCES;

	return sprintf(buf,
		       "Packets:\t%u (%u dropped)\n"
		       "Bytes: \t%u\n",
		       all_cpu_sum(&stats->packets),
		       all_cpu_sum(&stats->dropped),
		       all_cpu_sum(&stats->bytes));
}

static struct kobj_attribute stats_rx_attr =
	__ATTR(stats_rx, 0400, stats_show, NULL);
static struct kobj_attribute stats_rx_slow_attr =
	__ATTR(stats_rx_slow, 0400, stats_show, NULL);
static struct kobj_attribute stats_tx_attr =
	__ATTR(stats_tx, 0400, stats_show, NULL);

static void offdp_vep_sysfs_async(void *vep_arg, async_cookie_t cookie)
{
	struct offdp_vep *vep = vep_arg;

	kobject_init(&vep->kobj, &offdp_vep_kobj_type);
	vep->kobj.kset = veps_kset;
	kobject_add(&vep->kobj, NULL, "vep%d", (int)(vep - &veps[0]));
	sysfs_create_link(&vep->kobj, &vep->dev->dev.kobj, vep->dev->name);
	sysfs_create_file(&vep->kobj, &stats_rx_attr.attr);
	sysfs_create_file(&vep->kobj, &stats_rx_slow_attr.attr);
	sysfs_create_file(&vep->kobj, &stats_tx_attr.attr);
}
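
/*
 * Sketch of how an Ethernet driver could attach its device to the offload
 * datapath (assumption for illustration; "exampledrv" and its ndo_open are
 * hypothetical, only offdp_vep_register()/offdp_vep_unregister() are real
 * entry points of this file):
 *
 *	static int exampledrv_open(struct net_device *dev)
 *	{
 *		int err = offdp_vep_register(dev);
 *
 *		if (err && err != -EEXIST)
 *			netdev_warn(dev, "no offload fast path\n");
 *		return 0;
 *	}
 *
 * Registration is also reachable at runtime through the sysfs "register"
 * and "unregister" attributes created in offdp_virtualep_init().
 */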
int offdp_vep_register(struct net_device *dev)
{
	unsigned long vep_idx;
	struct offdp_vep *vep;

	if (_vep_from_netdev(dev, &vep) == OFFDP_SUCCESS) {
		pr_info("Cannot register twice: %s\n", dev->name);
		return -EEXIST;
	}

	vep_idx = handle_alloc(vep_idx_pool, ARRAY_SIZE(veps));
	if (vep_idx >= ARRAY_SIZE(veps))
		return -ENOMEM;
	pr_debug("alloc vep %lu\n", vep_idx);
	vep = &veps[vep_idx];

	/* Counters are zero'd by alloc_percpu. */
	vep->rx_stats = alloc_percpu_gfp(typeof(*vep->rx_stats), GFP_ATOMIC);
	vep->tx_stats = alloc_percpu_gfp(typeof(*vep->tx_stats), GFP_ATOMIC);
	vep->rx_slow_stats =
		alloc_percpu_gfp(typeof(*vep->rx_slow_stats), GFP_ATOMIC);
	if (!vep->rx_stats || !vep->tx_stats || !vep->rx_slow_stats)
		goto freeerr;

	if (OFFDP_IS_ERR(offdp_backend_vep_add(dev, &vep->handle)))
		goto freeerr;

	dev_hold(dev);
	vep->dev = dev;

	async_schedule(offdp_vep_sysfs_async, vep);

	spin_lock_bh(&vep_writer_lock);
	hash_add_rcu(vep2dev, &vep->vep2dev_node, vep->handle);
	hash_add_rcu(dev2vep, &vep->dev2vep_node, dev->ifindex);
	spin_unlock_bh(&vep_writer_lock);

	return 0;

freeerr:
	/* free_percpu() tolerates NULL, so partial allocations are fine here. */
	free_percpu(vep->rx_stats);
	free_percpu(vep->tx_stats);
	free_percpu(vep->rx_slow_stats);
	handle_free(vep_idx, vep_idx_pool);
	return -1;
}
EXPORT_SYMBOL(offdp_vep_register);

static void _vep_free_async(void *vep_ptr, async_cookie_t cookie)
{
	struct offdp_vep *vep = vep_ptr;

	kobject_put(&vep->kobj);
	memset(&vep->kobj, 0, sizeof(vep->kobj));
	handle_free(vep - veps, vep_idx_pool);
}

static void _vep_rcu_free(struct rcu_head *head)
{
	struct offdp_vep *vep = container_of(head, struct offdp_vep, rcu);

	pr_debug("free vep %d\n", (int)(vep - veps));
	offdp_backend_vep_remove(vep->handle);
	dev_put(vep->dev);
	vep->dev = NULL;

	/* Defer kobject removal as removing attributes may block */
	async_schedule(_vep_free_async, vep);
}

int offdp_vep_unregister(struct net_device *dev)
{
	struct offdp_vep *cur, *vep;

	spin_lock_bh(&vep_writer_lock);
	vep = NULL;
	hash_for_each_possible (dev2vep, cur, dev2vep_node, dev->ifindex) {
		if (cur->dev != dev)
			continue;
		vep = cur;
		break;
	}
	if (!vep) {
		spin_unlock_bh(&vep_writer_lock);
		return -1;
	}
	hash_del_rcu(&vep->vep2dev_node);
	hash_del_rcu(&vep->dev2vep_node);
	call_rcu(&vep->rcu, _vep_rcu_free);
	spin_unlock_bh(&vep_writer_lock);

	synchronize_rcu();

	return 0;
}
EXPORT_SYMBOL(offdp_vep_unregister);

#define OFFDP_SLOW_MARK 0xbE39

int offdp_vep_fast_rcv(struct sk_buff *skb)
{
	struct offdp_vep *vep;
	enum offdp_rv rv;
	unsigned long handle;
	unsigned long bytes;

	if (skb->mark == OFFDP_SLOW_MARK) {
		kfree_skb(skb);
		return -1;
	}

	rcu_read_lock();
	rv = _vep_from_netdev(skb->dev, &vep);
	if (rv != OFFDP_SUCCESS) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -1;
	}
	handle = vep->handle;
	bytes = skb->len;
	rv = offdp_backend_vep_fast_rcv(handle, skb);
	if (rv != OFFDP_SUCCESS) {
		raw_cpu_inc(vep->rx_stats->dropped);
		rcu_read_unlock();
		errstat_track(edom, rv);
		return -1;
	}
	raw_cpu_inc(vep->rx_stats->packets);
	raw_cpu_add(vep->rx_stats->bytes, bytes);
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(offdp_vep_fast_rcv);
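
/*
 * Illustrative sketch of the transmit side (assumption, not a call made in
 * this file): backend glue holding the handle returned by
 * offdp_backend_vep_add() can push a frame out through the net_device
 * behind that endpoint via offdp_vep_fast_xmit().  The skb is not consumed
 * on failure, so the caller keeps ownership:
 *
 *	if (offdp_vep_fast_xmit(handle, skb))
 *		kfree_skb(skb);
 */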
int offdp_vep_fast_xmit(unsigned long vep_handle, struct sk_buff *skb)
{
	struct net_device *dev;
	struct offdp_vep *vep;
	enum offdp_rv rv;

	rcu_read_lock();
	rv = _vep_from_handle(vep_handle, &vep);
	if (rv != OFFDP_SUCCESS) {
		rcu_read_unlock();
		errstat_track(edom, rv);
		return -1;
	}
	raw_cpu_inc(vep->tx_stats->packets);
	raw_cpu_add(vep->tx_stats->bytes, skb->len);
	dev = vep->dev;
	dev_hold(dev);
	skb->dev = dev;
	rcu_read_unlock();

	/*
	 * Dummy netdevs are usually in place when software device trickery is
	 * involved. This is currently done by the offload_pa tests.
	 * Do not show these incomplete devices to the kernel, but call their
	 * xmit op directly.
	 */
	if (skb->dev->reg_state == NETREG_DUMMY) {
		skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev);
	} else {
		dev_queue_xmit(skb);
	}
	dev_put(dev);

	return 0;
}
EXPORT_SYMBOL(offdp_vep_fast_xmit);

int offdp_vep_fast_rcv_raw(const struct net_device *dev, void *buf,
			   unsigned long offset, unsigned long len)
{
	struct offdp_vep *vep;
	enum offdp_rv rv;
	unsigned long handle;
	unsigned long bytes;

	rcu_read_lock();
	rv = _vep_from_netdev(dev, &vep);
	if (rv != OFFDP_SUCCESS) {
		rcu_read_unlock();
		return -1;
	}
	handle = vep->handle;
	bytes = len;
	rv = offdp_backend_vep_fast_rcv_raw(handle, buf, offset, len);
	if (rv != OFFDP_SUCCESS) {
		rcu_read_unlock();
		return -1;
	}
	raw_cpu_inc(vep->rx_stats->packets);
	raw_cpu_add(vep->rx_stats->bytes, bytes);
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(offdp_vep_fast_rcv_raw);

int offdp_vep_slow_rcv(unsigned long vep_handle, struct sk_buff *skb)
{
	struct offdp_vep *vep;
	struct net_device *dev;

	rcu_read_lock();
	if (_vep_from_handle(vep_handle, &vep) != OFFDP_SUCCESS) {
		rcu_read_unlock();
		return -1;
	}
	dev = vep->dev;
	dev_hold(dev);
	skb->dev = dev;
	rcu_read_unlock();

	raw_cpu_inc(vep->rx_slow_stats->packets);
	raw_cpu_add(vep->rx_slow_stats->bytes, skb->len);

#if IS_ENABLED(CONFIG_AVM_PA)
	if (avm_pa_dev_receive(AVM_PA_DEVINFO(dev), skb) == AVM_PA_RX_STOLEN) {
		dev_put(dev);
		return 0;
	}
#endif

	skb->protocol = eth_type_trans(skb, skb->dev);
	/* This packet will likely pass the fast_rcv hook, so mark it as a
	 * 'slow' packet.
	 */
	skb->mark = OFFDP_SLOW_MARK;
	netif_rx(skb);
	dev_put(dev);

	return 0;
}
EXPORT_SYMBOL(offdp_vep_slow_rcv);

bool offdp_is_vep(struct net_device *dev)
{
	struct offdp_vep *vep;
	enum offdp_rv rv;

	rv = _vep_from_netdev(dev, &vep);

	return (rv == OFFDP_SUCCESS);
}
EXPORT_SYMBOL(offdp_is_vep);

int offdp_ep_platform_data(const struct net_device *dev, void *data,
			   size_t len)
{
	struct offdp_vep *vep;
	unsigned long *vep_ptr;
	enum offdp_rv rv;

	rcu_read_lock();
	/* Check the platform-independent vep management for possible results. */
	if (vep_get_from_netdev(dev, &vep) == OFFDP_SUCCESS) {
		vep_ptr = &vep->handle;
	} else {
		vep = NULL;
		vep_ptr = NULL;
	}
	rcu_read_unlock();

	/* Let the backend collect all relevant information. */
	rv = offdp_backend_ep_platform_data(dev, vep_ptr, data, len);
	errstat_track(edom, rv);
	vep_put(vep);

	return (rv != OFFDP_SUCCESS);
}
EXPORT_SYMBOL(offdp_ep_platform_data);
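
/*
 * Fast-path interception: offdp_vep_tracepoint_hook() below attaches
 * probe_netif_rx() to the netif_rx tracepoint, so frames entering the
 * regular stack on a registered device get a chance to take the backend
 * fast path first.  Frames that offdp_vep_slow_rcv() already pushed through
 * netif_rx() carry OFFDP_SLOW_MARK and are left alone to avoid a loop.
 * Consumed skbs are invalidated by pointing skb->dev at the local
 * bubble_netdev and zeroing skb->len, so the instrumented caller can drop
 * ("pop") them as early as possible.
 */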
static struct net_device bubble_netdev;

static void probe_netif_rx(void *arg, struct sk_buff *skb)
{
	struct sk_buff *skb0;

	if (likely(skb->mark != OFFDP_SLOW_MARK)) {
		if (likely(!skb_shared(skb) && !skb_cloned(skb)) &&
		    !offdp_vep_fast_rcv_raw(
			    skb->dev, skb->head,
			    skb_headroom(skb) - skb->dev->hard_header_len,
			    skb->len + skb->dev->hard_header_len)) {
			skb->head = NULL;
		} else {
			skb0 = skb_clone(skb, GFP_ATOMIC);
			if (!skb0)
				return;
			skb_push(skb0, skb->dev->hard_header_len);
			if (offdp_vep_fast_rcv(skb0))
				return;
		}
		/* Hack to invalidate the skb and force the caller to pop that
		 * bubble as early as possible.
		 */
		skb->dev = &bubble_netdev;
		skb->len = 0;
	}
}

static void tp_iter(struct tracepoint *tp, void *priv)
{
	if (!strcmp(tp->name, "netif_rx"))
		tracepoint_probe_register(tp, probe_netif_rx, NULL);
}

enum offdp_rv offdp_vep_tracepoint_hook(void)
{
	for_each_kernel_tracepoint(tp_iter, NULL);

	return OFFDP_SUCCESS;
}

static ktd_ret_t offdp_vep_register_test(void *arg)
{
	struct offdp_vep *cur;
	struct net_device tmpdev;
	int i, bkt, free_veps, free_backend;
	unsigned long *backend_handles;
	size_t backend_handles_sz;

	init_dummy_netdev(&tmpdev);
	tmpdev.pcpu_refcnt = alloc_percpu(int);

	free_veps = ARRAY_SIZE(veps);
	rcu_read_lock();
	hash_for_each_rcu (vep2dev, bkt, cur, vep2dev_node) {
		free_veps--;
	}
	rcu_read_unlock();

	backend_handles_sz = ARRAY_SIZE(veps);
	backend_handles = kmalloc(backend_handles_sz * sizeof(*backend_handles),
				  GFP_ATOMIC);
	KTD_EXPECT(backend_handles);

	free_backend = 0;
	while (offdp_backend_vep_add(&tmpdev, &backend_handles[free_backend]) ==
	       OFFDP_SUCCESS) {
		/* Backend limit for VEPs can be higher than the frontend
		 * limit. Reallocate as needed.
		 */
		if (++free_backend == backend_handles_sz) {
			backend_handles_sz *= 2;
			backend_handles = krealloc(
				backend_handles,
				backend_handles_sz * sizeof(*backend_handles),
				GFP_ATOMIC);
			KTD_EXPECT(backend_handles);
		}
	}
	KTD_EXPECT(free_backend > 0);
	free_veps = min(free_veps, free_backend);
	/* The last add attempt failed, so the valid probe handles sit at
	 * indices 0 .. free_backend - 1.
	 */
	while (free_backend)
		offdp_backend_vep_remove(backend_handles[--free_backend]);

	for (i = 0; i < free_veps; i++)
		KTD_EXPECT(!offdp_vep_register(&tmpdev));
	KTD_EXPECT(offdp_vep_register(&tmpdev));

	/* Due to RCU the unregister call is not effective immediately. A
	 * successive call to register may or may not fail.
	 * KTD_EXPECT(!offdp_vep_unregister(&tmpdev));
	 * KTD_EXPECT(!offdp_vep_register(&tmpdev));
	 */

	for (i = 0; i < free_veps; i++)
		KTD_EXPECT(!offdp_vep_unregister(&tmpdev));

	free_percpu(tmpdev.pcpu_refcnt);
	kfree(backend_handles);

	return KTD_PASSED;
}

static ktd_ret_t offdp_vep_register_stress(unsigned int cpuid)
{
	struct net_device tmpdev;
	int i = 500;

	init_dummy_netdev(&tmpdev);
	tmpdev.pcpu_refcnt = alloc_percpu(int);

	while (i--) {
		KTD_EXPECT(!offdp_vep_register(&tmpdev));
		schedule();
		KTD_EXPECT(!offdp_vep_unregister(&tmpdev));
	}
	free_percpu(tmpdev.pcpu_refcnt);

	return KTD_PASSED;
}

static ssize_t register_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	ssize_t count;
	char *path;

	path = kobject_get_path(virtualep_kobj, GFP_KERNEL);
	if (!path)
		return 0;
	count = snprintf(buf, PAGE_SIZE,
			 "usage:\n"
			 "echo DEVICE > /sys%s/%s\n",
			 path, attr->attr.name);
	kfree(path);

	return count;
}

static ssize_t register_store(struct kobject *kobj, struct kobj_attribute *attr,
			      const char *buf, size_t count)
{
	struct net_device *dev;
	char name[ARRAY_SIZE(dev->name)];
	ssize_t rv = count;

	strlcpy(name, buf, min(ARRAY_SIZE(name), count + 1));
	dev = dev_get_by_name(&init_net, strim(name));
	if (!dev)
		return -ENODEV;

	if (!strcmp(attr->attr.name, "register")) {
		pr_info("register: %s\n", dev->name);
		if (offdp_vep_register(dev))
			rv = -EACCES;
	} else {
		pr_info("unregister: %s\n", dev->name);
		if (offdp_vep_unregister(dev))
			rv = -EACCES;
	}
	dev_put(dev);

	return rv;
}

static struct kobj_attribute register_attr =
	__ATTR(register, 0600, register_show, register_store);
static struct kobj_attribute unregister_attr =
	__ATTR(unregister, 0600, register_show, register_store);
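
/*
 * Expected call order from the offdp module core (an assumption; the real
 * call sites live outside this file):
 *
 *	if (OFFDP_IS_ERR(offdp_virtualep_init()))
 *		return -ENODEV;
 *	offdp_vep_tracepoint_hook();
 *	...
 *	offdp_virtualep_exit();
 */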
enum offdp_rv offdp_virtualep_init(void)
{
	virtualep_kobj = kobject_create_and_add("virtualep", offdp_kobj);
	sysfs_create_file(virtualep_kobj, &register_attr.attr);
	sysfs_create_file(virtualep_kobj, &unregister_attr.attr);
	veps_kset = kset_create_and_add("veps", NULL, virtualep_kobj);

	test_suite = ktd_suite_create(THIS_MODULE->name);
	ktd_register_concurrent(test_suite, "offdp_vep_register_stress",
				offdp_vep_register_stress);
	ktd_register(test_suite, "offdp_vep_register_test",
		     offdp_vep_register_test, NULL);

	return OFFDP_SUCCESS;
}

void offdp_virtualep_exit(void)
{
	struct offdp_vep *cur;
	int bkt;

	/* Unregister all remaining endpoints for backend consistency. */
	spin_lock_bh(&vep_writer_lock);
	hash_for_each (vep2dev, bkt, cur, vep2dev_node) {
		offdp_backend_vep_remove(cur->handle);
		kobject_del(&cur->kobj); /* XXX leak without _put()? */
	}
	/* Consume all vep entries to lock out late registration attempts. */
	while (handle_alloc(vep_idx_pool, ARRAY_SIZE(veps)) < ARRAY_SIZE(veps))
		;
	spin_unlock_bh(&vep_writer_lock);

	ktd_suite_destroy(test_suite);
	kset_unregister(veps_kset);
	kobject_put(virtualep_kobj);
}