/* * Pseudo-driver for the intermediate queue device. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Authors: Patrick McHardy, * * The first version was written by Martin Devera, * * Credits: Jan Rafaj * - Update patch to 2.4.21 * Sebastian Strollo * - Fix "Dead-loop on netdevice imq"-issue * Marcel Sebek * - Update to 2.6.2-rc1 * * After some time of inactivity there is a group taking care * of IMQ again: http://www.linuximq.net * * * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7 * including the following changes: * * - Correction of ipv6 support "+"s issue (Hasso Tepper) * - Correction of imq_init_devs() issue that resulted in * kernel OOPS unloading IMQ as module (Norbert Buchmuller) * - Addition of functionality to choose number of IMQ devices * during kernel config (Andre Correa) * - Addition of functionality to choose how IMQ hooks on * PRE and POSTROUTING (after or before NAT) (Andre Correa) * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa) * * * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were * released with almost no problems. 2.6.14-x was released * with some important changes: nfcache was removed; After * some weeks of trouble we figured out that some IMQ fields * in skb were missing in skbuff.c - skb_clone and copy_skb_header. * These functions are correctly patched by this new patch version. * * Thanks for all who helped to figure out all the problems with * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX, * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully * I didn't forget anybody). I apologize again for my lack of time. * * * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead * of qdisc_restart() and moved qdisc_run() to tasklet to avoid * recursive locking. New initialization routines to fix 'rmmod' not * working anymore. Used code from ifb.c. (Jussi Kivilinna) * * 2008/08/06 - 2.6.26 - (JK) * - Replaced tasklet with 'netif_schedule()'. * - Cleaned up and added comments for imq_nf_queue(). * * 2009/04/12 * - Add skb_save_cb/skb_restore_cb helper functions for backuping * control buffer. This is needed because qdisc-layer on kernels * 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna) * - Add better locking for IMQ device. Hopefully this will solve * SMP issues. (Jussi Kivilinna) * - Port to 2.6.27 * - Port to 2.6.28 * - Port to 2.6.29 + fix rmmod not working * * 2009/04/20 - (Jussi Kivilinna) * - Use netdevice feature flags to avoid extra packet handling * by core networking layer and possibly increase performance. * * 2009/09/26 - (Jussi Kivilinna) * - Add imq_nf_reinject_lockless to fix deadlock with * imq_nf_queue/imq_nf_reinject. * * Also, many thanks to pablo Sebastian Greco for making the initial * patch and to those who helped the testing. * * More info at: http://www.linuximq.net/ (Andre Correa) */ #include #include #include #include #include #include #include #include #include #include #include #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #include #endif #include #include #include static nf_hookfn imq_nf_hook; static struct nf_hook_ops imq_ingress_ipv4 = { .hook = imq_nf_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_PRE_ROUTING, #if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) .priority = NF_IP_PRI_MANGLE + 1 #else .priority = NF_IP_PRI_NAT_DST + 1 #endif }; static struct nf_hook_ops imq_egress_ipv4 = { .hook = imq_nf_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_POST_ROUTING, #if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) .priority = NF_IP_PRI_LAST #else .priority = NF_IP_PRI_NAT_SRC - 1 #endif }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static struct nf_hook_ops imq_ingress_ipv6 = { .hook = imq_nf_hook, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_PRE_ROUTING, #if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) .priority = NF_IP6_PRI_MANGLE + 1 #else .priority = NF_IP6_PRI_NAT_DST + 1 #endif }; static struct nf_hook_ops imq_egress_ipv6 = { .hook = imq_nf_hook, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_POST_ROUTING, #if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) .priority = NF_IP6_PRI_LAST #else .priority = NF_IP6_PRI_NAT_SRC - 1 #endif }; #endif #if defined(CONFIG_IMQ_NUM_DEVS) static unsigned int numdevs = CONFIG_IMQ_NUM_DEVS; #else static unsigned int numdevs = IMQ_MAX_DEVS; #endif static DEFINE_SPINLOCK(imq_nf_queue_lock); static struct net_device *imq_devs_cache[IMQ_MAX_DEVS]; static struct net_device_stats *imq_get_stats(struct net_device *dev) { return &dev->stats; } /* called for packets kfree'd in qdiscs at places other than enqueue */ static void imq_skb_destructor(struct sk_buff *skb) { struct nf_queue_entry *entry = skb->nf_queue_entry; if (entry) { nf_queue_entry_release_refs(entry); kfree(entry); } skb_restore_cb(skb); /* kfree backup */ } /* locking not needed when called from imq_nf_queue */ static void imq_nf_reinject_lockless(struct nf_queue_entry *entry, unsigned int verdict) { int status; if (!entry->next_outfn) { nf_reinject(entry, verdict); return; } status = entry->next_outfn(entry, entry->next_queuenum); if (status < 0) { nf_queue_entry_release_refs(entry); kfree_skb(entry->skb); kfree(entry); } } static void imq_nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { int status; if (!entry->next_outfn) { spin_lock_bh(&imq_nf_queue_lock); nf_reinject(entry, verdict); spin_unlock_bh(&imq_nf_queue_lock); return; } rcu_read_lock(); local_bh_disable(); status = entry->next_outfn(entry, entry->next_queuenum); local_bh_enable(); if (status < 0) { nf_queue_entry_release_refs(entry); kfree_skb(entry->skb); kfree(entry); } rcu_read_unlock(); } static int imq_dev_xmit(struct sk_buff *skb, struct net_device *dev) { dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; skb->imq_flags = 0; skb->destructor = NULL; skb_restore_cb(skb); /* restore skb->cb */ dev->trans_start = jiffies; imq_nf_reinject(skb->nf_queue_entry, NF_ACCEPT); return 0; } static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num) { struct net_device *dev; struct sk_buff *skb_orig, *skb, *skb_shared; struct Qdisc *q; struct netdev_queue *txq; int users, index; int retval = -EINVAL; index = entry->skb->imq_flags & IMQ_F_IFMASK; if (unlikely(index > numdevs - 1)) { if (net_ratelimit()) printk(KERN_WARNING "IMQ: invalid device specified, highest is %u\n", numdevs - 1); retval = -EINVAL; goto out; } /* check for imq device by index from cache */ dev = imq_devs_cache[index]; if (unlikely(!dev)) { char buf[8]; /* get device by name and cache result */ snprintf(buf, sizeof(buf), "imq%d", index); dev = dev_get_by_name(&init_net, buf); if (!dev) { /* not found ?!*/ BUG(); retval = -ENODEV; goto out; } imq_devs_cache[index] = dev; dev_put(dev); } if (unlikely(!(dev->flags & IFF_UP))) { entry->skb->imq_flags = 0; imq_nf_reinject_lockless(entry, NF_ACCEPT); retval = 0; goto out; } dev->last_rx = jiffies; skb = entry->skb; skb_orig = NULL; /* skb has owner? => make clone */ if (unlikely(skb->destructor)) { skb_orig = skb; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { retval = -ENOMEM; goto out; } entry->skb = skb; } skb->nf_queue_entry = entry; dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; txq = dev_pick_tx(dev, skb); q = rcu_dereference(txq->qdisc); if (unlikely(!q->enqueue)) goto packet_not_eaten_by_imq_dev; spin_lock_bh(qdisc_lock(q)); users = atomic_read(&skb->users); skb_shared = skb_get(skb); /* increase reference count by one */ skb_save_cb(skb_shared); /* backup skb->cb, as qdisc layer will overwrite it */ qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */ if (likely(atomic_read(&skb_shared->users) == users + 1)) { kfree_skb(skb_shared); /* decrease reference count by one */ skb->destructor = &imq_skb_destructor; /* cloned? */ if (skb_orig) kfree_skb(skb_orig); /* free original */ spin_unlock_bh(qdisc_lock(q)); /* schedule qdisc dequeue */ __netif_schedule(q); retval = 0; goto out; } else { skb_restore_cb(skb_shared); /* restore skb->cb */ /* qdisc dropped packet and decreased skb reference count of * skb, so we don't really want to and try refree as that would * actually destroy the skb. */ spin_unlock_bh(qdisc_lock(q)); goto packet_not_eaten_by_imq_dev; } packet_not_eaten_by_imq_dev: /* cloned? restore original */ if (skb_orig) { kfree_skb(skb); entry->skb = skb_orig; } retval = -1; out: return retval; } static struct nf_queue_handler nfqh = { .name = "imq", .outfn = imq_nf_queue, }; static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb, const struct net_device *indev, const struct net_device *outdev, int (*okfn)(struct sk_buff *)) { if (pskb->imq_flags & IMQ_F_ENQUEUE) return NF_QUEUE; return NF_ACCEPT; } static int imq_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static int imq_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static const struct net_device_ops imq_netdev_ops = { .ndo_open = imq_open, .ndo_stop = imq_close, .ndo_start_xmit = imq_dev_xmit, .ndo_get_stats = imq_get_stats, }; static void imq_setup(struct net_device *dev) { dev->netdev_ops = &imq_netdev_ops; dev->type = ARPHRD_VOID; dev->mtu = 16000; dev->tx_queue_len = 11000; dev->flags = IFF_NOARP; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static int imq_validate(struct nlattr *tb[], struct nlattr *data[]) { int ret = 0; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { ret = -EINVAL; goto end; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { ret = -EADDRNOTAVAIL; goto end; } } return 0; end: printk(KERN_WARNING "IMQ: imq_validate failed (%d)\n", ret); return ret; } static struct rtnl_link_ops imq_link_ops __read_mostly = { .kind = "imq", .priv_size = 0, .setup = imq_setup, .validate = imq_validate, }; static int __init imq_init_hooks(void) { int err; nf_register_queue_imq_handler(&nfqh); err = nf_register_hook(&imq_ingress_ipv4); if (err) goto err1; err = nf_register_hook(&imq_egress_ipv4); if (err) goto err2; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) err = nf_register_hook(&imq_ingress_ipv6); if (err) goto err3; err = nf_register_hook(&imq_egress_ipv6); if (err) goto err4; #endif return 0; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) err4: nf_unregister_hook(&imq_ingress_ipv6); err3: nf_unregister_hook(&imq_egress_ipv4); #endif err2: nf_unregister_hook(&imq_ingress_ipv4); err1: nf_unregister_queue_imq_handler(); return err; } static int __init imq_init_one(int index) { struct net_device *dev; int ret; dev = alloc_netdev(0, "imq%d", imq_setup); if (!dev) return -ENOMEM; ret = dev_alloc_name(dev, dev->name); if (ret < 0) goto fail; dev->rtnl_link_ops = &imq_link_ops; ret = register_netdevice(dev); if (ret < 0) goto fail; return 0; fail: free_netdev(dev); return ret; } static int __init imq_init_devs(void) { int err, i; if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) { printk(KERN_ERR "IMQ: numdevs has to be betweed 1 and %u\n", IMQ_MAX_DEVS); return -EINVAL; } rtnl_lock(); err = __rtnl_link_register(&imq_link_ops); for (i = 0; i < numdevs && !err; i++) err = imq_init_one(i); if (err) { __rtnl_link_unregister(&imq_link_ops); memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); } rtnl_unlock(); return err; } static int __init imq_init_module(void) { int err; #if defined(CONFIG_IMQ_NUM_DEVS) BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16); BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2); BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK); #endif err = imq_init_devs(); if (err) { printk(KERN_ERR "IMQ: Error trying imq_init_devs(net)\n"); return err; } err = imq_init_hooks(); if (err) { printk(KERN_ERR "IMQ: Error trying imq_init_hooks()\n"); rtnl_link_unregister(&imq_link_ops); memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); return err; } printk(KERN_INFO "IMQ driver loaded successfully.\n"); #if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n"); #else printk(KERN_INFO "\tHooking IMQ after NAT on PREROUTING.\n"); #endif #if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB) printk(KERN_INFO "\tHooking IMQ before NAT on POSTROUTING.\n"); #else printk(KERN_INFO "\tHooking IMQ after NAT on POSTROUTING.\n"); #endif return 0; } static void __exit imq_unhook(void) { #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) nf_unregister_hook(&imq_ingress_ipv6); nf_unregister_hook(&imq_egress_ipv6); #endif nf_unregister_hook(&imq_ingress_ipv4); nf_unregister_hook(&imq_egress_ipv4); nf_unregister_queue_imq_handler(); } static void __exit imq_cleanup_devs(void) { rtnl_link_unregister(&imq_link_ops); memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); } static void __exit imq_exit_module(void) { imq_unhook(); imq_cleanup_devs(); printk(KERN_INFO "IMQ driver unloaded successfully.\n"); } module_init(imq_init_module); module_exit(imq_exit_module); module_param(numdevs, int, 0); MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will " "be created)"); MODULE_AUTHOR("http://www.linuximq.net"); MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See " "http://www.linuximq.net/ for more information."); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("imq");