/*
 * Packet Accelerator Interface
 *
 * vim:set expandtab shiftwidth=3 softtabstop=3:
 *
 * Copyright (c) 2011-2020 AVM GmbH
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed and/or modified under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * PID  - peripheral ID
 *        Identifies a low-level device; may be a network driver, or,
 *        for ATM, every VCC has its own PID
 * VPID - virtual peripheral ID
 *        Is assigned to a network device or a virtual network device
 *
 * Sessions can have five states:
 * - FREE    : session on sess_list[AVM_PA_LIST_FREE]
 * - CREATE  : session is on no list
 * - ACTIVE  : session on sess_list[AVM_PA_LIST_ACTIVE], in hashtable and not flushed
 * - FLUSHED : session on sess_list[AVM_PA_LIST_ACTIVE], in hashtable and flushed
 * - DEAD    : session on sess_list[AVM_PA_LIST_DEAD]
 *
 * FREE    -> pa_session_alloc()    -> CREATE
 * CREATE  -> pa_session_activate() -> ACTIVE
 * ACTIVE  -> pa_session_flush()    -> FLUSHED
 * FLUSHED -> pa_session_tick()     -> DEAD
 * DEAD    -> pa_session_tick()     -> FREE
 *
 * pa_session_kill() can transition from any state to DEAD. Use it only if you
 * know that an immediate GC trigger (which moves from DEAD to FREE) won't be
 * a problem; otherwise use pa_session_flush(), which is safe. In general,
 * this is only the case when a session wasn't ACTIVE yet (before
 * pa_session_activate() completes). pa_session_flush() guarantees that at least one
 * complete GC period happens before a session transitions to FREE.
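 *
 * A typical lifecycle therefore looks like this (informal summary of the
 * transitions above):
 *
 *   pa_session_alloc()      FREE    -> CREATE   (on no list yet)
 *   pa_session_activate()   CREATE  -> ACTIVE   (hashed, on the ACTIVE list)
 *   pa_session_flush()      ACTIVE  -> FLUSHED  (still hashed, flush pending)
 *   pa_session_tick()       FLUSHED -> DEAD     (moved to the DEAD list)
 *   pa_session_tick()       DEAD    -> FREE     (returned to the free list)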
*/ #define AVM_PA_FORCE_PRINTK_ENABLED 0 #if AVM_PA_FORCE_PRINTK_ENABLED # ifdef CONFIG_NO_PRINTK # define printk __printk # endif # define DEBUG /* want pr_debug to be compiled in */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Necessary for MIPS Platforms without arch-support for ipv6 chksums */ #include #include #include #include #include #ifdef CONFIG_AVM_POWERMETER #include #endif #ifdef CONFIG_AVM_SIMPLE_PROFILING #include #else #define avm_simple_profiling_skb(a,b) do { } while(0) #endif #include // MODULE_NAME_LEN needed by kallsyms.h (who fails to include himself) #include // sprint_symbol() #include #include #include #include #include #include #include /* ARPHRD_NONE */ #include #include #include #include #include "avm_pa.h" #include "avm_pa_hw.h" #include "avm_pa_intern.h" #ifdef CONFIG_AVM_PA_GENERIC_CT #include "generic_ct/generic_ct_ops.h" #endif #ifdef CONFIG_BLOG #include #include #define BROADCOM_MAX_PRIOS 8 #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #else #error missing NF_CONNTRACK for Broadcom #endif #endif #ifdef CONFIG_AVM_GENERIC_CONNTRACK #warning Please do not use deprecated AVM_GENERIC_CONNTRACK #define SKB_GENERIC_CT(skb) ((skb)->generic_ct) #define SKB_GENERIC_CT_DIR(skb) ((skb)->nfctinfo) #endif #ifdef CONFIG_L2TP #include #include "../l2tp/l2tp_core.h" #endif #include #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) #define INIT_CSD(_csd, _func, _info) {*(_csd) = (call_single_data_t){ .func = (_func), .info = (_info), };} #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) typedef struct call_single_data call_single_data_t; #endif #endif /* ------------------------------------------------------------------------ */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) /* Until 5.0, call_rcu() call_rcu_bh() were semantically different. * Then, 5.0 makde call_rcu_bh() be a wrapper because a refactoring * makde call_rcu() and call_rcu_bh() semantically equivalent. Finally, * 5.1 removed the call_rcu_bh() wrapper for greater good. * * => For older kernels we still need to call the _bh variant */ static inline void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) #define skb_vlan_tag_get_id vlan_tx_tag_get_id #define skb_vlan_tag_get vlan_tx_tag_get #define skb_vlan_tag_present vlan_tx_tag_present #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) static inline void skb_vlan_tag_put(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { #ifdef AVM_PA_SKBUFF_HAS_VLAN_PROTO skb->vlan_proto = vlan_proto; #endif skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci; } #else /* >= 4.0 */ #define skb_vlan_tag_put __vlan_hwaccel_put_tag #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0) static inline void skb_vlan_tag_clear(struct sk_buff *skb) { skb->vlan_tci = 0; } #else /* >= 5.0 */ #define skb_vlan_tag_clear __vlan_hwaccel_clear_tag #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) /* For non-broken smp_call_function_single_async() the following commits are needed: * commit 5224b961 smp: Fix error case handling in smp_call_function_*() * commit 8053871d smp: Fix smp_call_function_single_async() locking * * The commits landed in Linux 4.1. Any older kernel lacking those have a broken * smp_call_function_single_async() and we cannot use RPS (we saw panics every now and then). */ #ifdef CONFIG_AVM_PA_RPS #error Broken smp_call_function_single_async(). 
Upgrade the kernel, backport 8053871d and 5224b961 or disable CONFIG_AVM_PA_RPS. #endif #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 218) /* Really since 4.11.0. Backported to 4.9.y. * 4.10.y doesn't have it but it's EOL anyway. * * commit 2c935bc572 locking/atomic, kref: Add kref_read() */ #define kref_read(r) atomic_read(&(r)->refcount) #endif #if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 17, 0) /* See mainline commits: * commit 1d023284 list: fix order of arguments for hlist_add_after(_rcu) * * Note that the macro was renamed and arguments order swapped. */ #define hlist_add_behind_rcu(new, prev) hlist_add_after_rcu(prev, new) #endif /* ------------------------------------------------------------------------ */ #if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 0, 0) static inline void skb_reset_mac_len(struct sk_buff *skb) { skb->mac_len = skb->network_header - skb->mac_header; } #endif /* ------------------------------------------------------------------------ */ #define AVM_PA_TRACE 1 /* 0: off */ #define AVM_PA_UNALIGNED_CHECK 0 #define AVM_PA_REF_DEBUG 0 /* 0: off */ #define TX_NAPI_MAXQUEUE 512 #define TX_NAPI_BUDGET 64 #ifndef ETH_P_8021AD #define ETH_P_8021AD 0x88A8 #endif /* ------------------------------------------------------------------------ */ static inline void set_udp_checksum(struct iphdr *iph, struct udphdr *udph) { unsigned short len = ntohs(udph->len); __wsum sum; udph->check = 0; sum = csum_partial((unsigned char *)udph, len, 0); udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, len, IPPROTO_UDP, sum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; } static inline void set_udpv6_checksum(struct ipv6hdr *ipv6h, struct udphdr *udph) { unsigned short len = ntohs(udph->len); __wsum sum; udph->check = 0; sum = csum_partial((unsigned char *)udph, len, 0); udph->check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, len, IPPROTO_UDP, sum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; } /* Return a uniq id for a give skb. Currently it simply * returns its addresses with the always-zero low bits shifted away. */ static inline unsigned long pkt_uniq_id(PKT *pkt) { unsigned long addr = (unsigned long) pkt; unsigned long shift = max(L1_CACHE_SHIFT, 2); return addr >> shift; } /* ------------------------------------------------------------------------ */ static inline int rand(void) { int x; get_random_bytes(&x, sizeof(x)); return x; } #define PKT_DATA(pkt) (pkt)->data /* PKT_LEN has the data in the head skb. For frag_list skbs, this is just L2/3/4 headers * without any payload. For normal skbs it includes the payload after the headers. */ #define PKT_LEN(pkt) (skb_headlen(pkt)) #define PKT_PULL(pkt, len) skb_pull(pkt, len) #define PKT_PUSH(pkt, len) skb_push(pkt, len) #define PKT_FREE(pkt) dev_kfree_skb_any(pkt) #define PKT_COPY(pkt) skb_copy(pkt, GFP_ATOMIC) #ifdef AVM_PA_SKBUFF_HAS_VLAN_PROTO #define PA_VLAN_PROTO(pkt) (pkt)->vlan_proto #else #define PA_VLAN_PROTO(pkt) (constant_htons(ETH_P_8021Q)) #endif static int pa_printk(void *type, const char *format, ...) #ifdef __GNUC__ __attribute__ ((__format__(__printf__, 2, 3))) #endif ; static int pa_printk(void *type, const char *format, ...) 
{ va_list args; int rc; va_start(args, format); if (type) printk("%s", (char *)type); rc = vprintk(format, args); va_end(args); return rc; } /* ------------------------------------------------------------------------ */ #define constant_htons(x) __constant_htons(x) #undef IPPROTO_IPENCAP #define IPPROTO_IPENCAP 4 #ifndef IPPROTO_L2TP #define IPPROTO_L2TP 115 #endif /* * Accelerating of L2TPv3 only works with * pseudowire ethernet or ethernet vlan * and default l2-specific header. */ /* ------------------------------------------------------------------------ */ #define AVM_PA_TICK_RATE (500*HZ/1000) /* 0.5 secs */ #define AVM_PA_LC_TIMEOUT 2 /* secs */ #define AVM_PA_TRAFFIC_IDLE_TBFDISABLE 10 /* secs */ /* ------------------------------------------------------------------------ */ #define AVM_PA_MAX_IRQ_QUEUE_LEN 64 #define AVM_PA_EST_DEFAULT_IDX 0 /* 0 - 5 => 0.25sec - 8sec */ #define AVM_PA_EST_DEFAULT_EWMA_LOG 3 /* 1 - 31 */ #define AVM_PA_PRIOACK_THRESH_PKTS 40 /* wait for X packets to do the TCP-ACK check */ #define AVM_PA_PRIOACK_RATIO 70 /* % of packets have to be TCP-ACKs for positive check */ #define AVM_PA_COUNT_PRIO_MAPS 2 /* tack and tget */ #define AVM_PA_BE_QUEUE 6 /* best-effort queue */ #define AVM_PA_INGRESS_PRIO_NET_MASK 0xFFFF0000U #define AVM_PA_INGRESS_PRIO_HOST_MASK 0x0000FFFFU #define AVM_PA_INGRESS_PRIO_NET(prio) (((prio) & AVM_PA_INGRESS_PRIO_NET_MASK) >> 16) #define AVM_PA_INGRESS_PRIO_HOST(prio) ( (prio) & AVM_PA_INGRESS_PRIO_HOST_MASK) /* ------------------------------------------------------------------------ */ static DEFINE_SPINLOCK(avm_pa_lock); struct avm_pa_est { unsigned idx; unsigned ewma_log; u32 last_packets; u32 avpps; }; struct avm_pa_global { int disabled; int fw_disabled; atomic_t misc_is_open; /* means fw_disabled */ int dbgcapture; int dbgsession; int dbgnosession; int dbgtrace; int dbgmatch; int dbgprioack; int dbgprioacktrace; int dbgstats; bool bsession_allowed; unsigned long tcp_timeout_secs; unsigned long udp_timeout_secs; unsigned long echo_timeout_secs; unsigned long bridge_timeout_secs; struct avm_pa_pid pid_array[CONFIG_AVM_PA_MAX_PID]; struct avm_pa_vpid vpid_array[CONFIG_AVM_PA_MAX_VPID]; struct avm_pa_session_list sess_list[AVM_PA_LIST_MAX]; struct avm_pa_bsession bsess_array[CONFIG_AVM_PA_MAX_SESSION]; struct avm_pa_macaddr macaddr_array[AVM_PA_MAX_MACADDR]; struct hlist_head macaddr_hashtab[AVM_PA_MAX_MACADDR]; struct avm_pa_stats stats, stats_copy; struct hlist_head egress_freelist; atomic_t session_uniq_id; atomic_t ingress_uniq_id; struct timer_list tick_timer; struct sk_buff_head irqqueue; struct tasklet_struct irqtasklet; /* packet rate estimater */ char est_start[0]; int est_idx; int ewma_log; struct timer_list est_timer; struct avm_pa_est rx_est; struct avm_pa_est fw_est; struct avm_pa_est overlimit_est; char est_end[0]; int rps_enabled; #ifdef CONFIG_AVM_PA_RPS #define PA_RPS_REVERSE_SIZE 256 /* 256 * sizeof(struct sk_buff *) */ struct avm_pa_rps { struct sk_buff *q; /* lockless enqueue/dequeue (in reverse order) */ struct sk_buff **r; /* Array for reverse, grows dynamically */ size_t r_sz; struct tasklet_struct dequeue_task; struct tasklet_struct ipi_task; call_single_data_t csd; unsigned long rx_enqueued; unsigned long rx_rps_ipis; unsigned long rx_dequeued; } rps[CONFIG_AVM_PA_RPS_QUEUES]; #endif /* ... 
*/ char tok_start[0]; struct task_struct *tok_task; int tok_pos; #define TOK_SAMLES 64 int tok_state[TOK_SAMLES]; unsigned tok_overtime[TOK_SAMLES]; unsigned tok_rate[TOK_SAMLES]; unsigned tok_pps[TOK_SAMLES]; unsigned long tok_overlimit[TOK_SAMLES]; char tok_end[0]; unsigned prioack_thresh_packets; unsigned prioack_ratio; struct avm_hardware_pa hardware_pa; int hw_ppa_disabled; struct completion *hw_pa_flush_completion; struct kref hw_pa_ref; #ifdef CONFIG_PROC_FS int filter_enabled; struct list_head accel_filter; /* empty to accelerate all sessions (if filter_enabled == 1) */ struct list_head show_filter; /* empty to show all sessions (default) */ #endif } pa_glob = { .disabled = 1, .fw_disabled = 1, .dbgcapture = 0, .dbgsession = 0, .dbgnosession = 0, .dbgtrace = 0, .dbgmatch = 0, .dbgprioack = 0, .dbgprioacktrace = 0, .dbgstats = 0, .bsession_allowed = 1, .tcp_timeout_secs = 10, .udp_timeout_secs = 10, .echo_timeout_secs = 3, .bridge_timeout_secs = 30, .est_idx = AVM_PA_EST_DEFAULT_IDX, .ewma_log = AVM_PA_EST_DEFAULT_EWMA_LOG, .prioack_thresh_packets = AVM_PA_PRIOACK_THRESH_PKTS, .prioack_ratio = AVM_PA_PRIOACK_RATIO, .filter_enabled = 1, #ifdef CONFIG_AVM_PA_RPS .rps_enabled = 1, #endif }; struct avm_pa_data pa_data; #define PA_PID(ctx, handle) (&ctx->pid_array[(handle)%CONFIG_AVM_PA_MAX_PID]) #define PA_VPID(ctx, handle) (&ctx->vpid_array[(handle)%CONFIG_AVM_PA_MAX_VPID]) #define PA_SESSION(pd, handle) (&(pd)->sessions[(handle)%CONFIG_AVM_PA_MAX_SESSION]) #define PA_BSESSION(ctx, handle) (&ctx->bsess_array[(handle)%CONFIG_AVM_PA_MAX_SESSION]) /* ------------------------------------------------------------------------ */ static void pa_session_kill_nolock(struct avm_pa_session *session, const char *why); static void pa_session_kill(struct avm_pa_session *session, const char *why); static void pa_session_flush(struct avm_pa_session *session, const char *why); static int pa_session_handle_stats(struct avm_pa_session *session); static void pa_show_session(struct avm_pa_session *session, pa_fprintf fprintffunc, void *arg); static int avm_pa_pid_receive(avm_pid_handle pid_handle, PKT *pkt); static void avm_pa_flush_sessions_with_destmac(struct avm_pa_macaddr *destmac); static void avm_pa_flush_hw_sessions(void); static inline int avm_pa_pid_tack_enabled(struct avm_pa_pid *pid) { return pid->prio_maps[AVM_PA_PRIO_MAP_TACK].enabled; } static inline int avm_pa_pid_tget_enabled(struct avm_pa_pid *pid) { return pid->prio_maps[AVM_PA_PRIO_MAP_TGET].enabled; } /* * Helper functions to retrieve a valid tack or tget priority from a pid's priority map. * Remember: prio_maps must include the correct TC_H_MAJ part. 
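 * For example, a prio whose minor number is 3 selects prios[3] as long as
 * 3 < AVM_PA_MAX_PRIOS; out-of-range minors fall back to 0 (tack) or to the
 * unmapped prio (tget), see below.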
*/ static inline unsigned int avm_pa_pid_tack_prio(struct avm_pa_pid *pid, unsigned int prio) { if (likely((prio & TC_H_MIN_MASK) < AVM_PA_MAX_PRIOS)) return pid->prio_maps[AVM_PA_PRIO_MAP_TACK].prios[prio & TC_H_MIN_MASK]; return 0; } static inline unsigned int avm_pa_pid_tget_prio(struct avm_pa_pid *pid, unsigned int prio) { if (likely((prio & TC_H_MIN_MASK) < AVM_PA_MAX_PRIOS)) return pid->prio_maps[AVM_PA_PRIO_MAP_TGET].prios[prio & TC_H_MIN_MASK]; return prio; } /* ------------------------------------------------------------------------ */ static inline int avm_pa_capture_running(void) { struct avm_pa_global *ctx = &pa_glob; if (ctx->dbgcapture) return 0; return atomic_read(&ctx->misc_is_open); } /* ------------------------------------------------------------------------ */ /* -------- utilities ----------------------------------------------------- */ /* ------------------------------------------------------------------------ */ static const char *egresstype2str(enum avm_pa_egresstype etype) { switch (etype) { case avm_pa_egresstype_output : return "output"; case avm_pa_egresstype_local : return "local"; case avm_pa_egresstype_rtp : return "rtp"; case avm_pa_egresstype_xfrm : return "xfrm"; case avm_pa_egresstype_null : return "null"; } return "???"; } static const char *rc2str(int rc) { switch (rc) { case AVM_PA_RX_BROADCAST : return "is broadcast"; case AVM_PA_RX_TTL : return "ttl/hoplimit <= 1"; case AVM_PA_RX_FRAGMENT : return "is fragment"; case AVM_PA_RX_BYPASS : return "bypass"; case AVM_PA_RX_OK : return "ok"; case AVM_PA_RX_ACCELERATED : return "accelerated"; case AVM_PA_RX_ERROR_STATE : return "state machine problem ?"; case AVM_PA_RX_ERROR_LEN : return "packet too short"; case AVM_PA_RX_ERROR_IPVERSION : return "illegal ip version"; case AVM_PA_RX_ERROR_MATCH : return "too much header"; case AVM_PA_RX_ERROR_HDR : return "too much ip header"; } return "???"; } static const char *framing2str(enum avm_pa_framing framing) { switch (framing) { case avm_pa_framing_ether: return "ether"; case avm_pa_framing_ppp: return "ppp"; case avm_pa_framing_ip: return "ip"; case avm_pa_framing_ipdev: return "ipdev"; case avm_pa_framing_dev: return "dev"; case avm_pa_framing_ptype: return "local"; case avm_pa_framing_llcsnap: return "llcsnap"; } return "undef"; } static int in6_addr2str(const void *cp, char *buf, size_t size) { const struct in6_addr *s = (const struct in6_addr *)cp; return snprintf(buf, size, "%x:%x:%x:%x:%x:%x:%x:%x", ntohs(s->s6_addr16[0]), ntohs(s->s6_addr16[1]), ntohs(s->s6_addr16[2]), ntohs(s->s6_addr16[3]), ntohs(s->s6_addr16[4]), ntohs(s->s6_addr16[5]), ntohs(s->s6_addr16[6]), ntohs(s->s6_addr16[7])); } static int in_addr2str(const void *cp, char *buf, size_t size) { const unsigned char *s = (const unsigned char *)cp; return snprintf(buf, size, "%d.%d.%d.%d", s[0], s[1], s[2], s[3]); } static int mac2str(const void *cp, char *buf, size_t size) { const unsigned char *mac = (const unsigned char *)cp; return snprintf(buf, size, "%02X:%02X:%02X:%02X:%02X:%02X", mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } static const char *pkttype2str(u16 pkttype, char *buf, size_t size) { char *p = buf; char *end = p + size; if (pkttype == AVM_PA_PKTTYPE_NONE) { snprintf(p, end-p, "none"); return buf; } switch (pkttype & AVM_PA_PKTTYPE_IPENCAP_MASK) { case AVM_PA_PKTTYPE_IPV6ENCAP: snprintf(p, end-p, "IPv6+"); p += strlen(p); break; case AVM_PA_PKTTYPE_IPV4ENCAP: snprintf(p, end-p, "IPv4+"); p += strlen(p); break; } if (pkttype & AVM_PA_PKTTYPE_LISP) { snprintf(p, end-p, "LISP+"); p 
+= strlen(p); } if (pkttype & AVM_PA_PKTTYPE_L2TP) { snprintf(p, end-p, "L2TPv3+"); p += strlen(p); } if (pkttype & AVM_PA_PKTTYPE_GRE) { snprintf(p, end-p, "GRE+"); p += strlen(p); } switch (pkttype & AVM_PA_PKTTYPE_IP_MASK) { case AVM_PA_PKTTYPE_IPV6: snprintf(p, end-p, "IPv6"); p += strlen(p); break; case AVM_PA_PKTTYPE_IPV4: snprintf(p, end-p, "IPv4"); p += strlen(p); break; } if (AVM_PA_PKTTYPE_IPPROTO(pkttype)) { switch (AVM_PA_PKTTYPE_IPPROTO(pkttype)) { case IPPROTO_UDP: snprintf(p, end-p, "+UDP"); break; case IPPROTO_TCP: snprintf(p, end-p, "+TCP"); break; case IPPROTO_ICMP: snprintf(p, end-p, "+ICMP"); break; case IPPROTO_ICMPV6: snprintf(p, end-p, "+ICMPV6"); break; case IPPROTO_L2TP: snprintf(p, end-p, "+L2TPv3"); break; case IPPROTO_ESP: snprintf(p, end-p, "+ESP"); break; default: snprintf(p, end-p, "+P%u", AVM_PA_PKTTYPE_IPPROTO(pkttype)); break; } } return buf; } static char *data2hex(void *data, int datalen, char *buf, int bufsiz) { static char hexchars[] = "0123456789ABCDEF"; unsigned char *databuf = (unsigned char *)data; char *s = buf; char *end = buf+bufsiz; int i; snprintf(s, end-s, "%d: ", datalen); s += strlen(s); for (i=0; i < datalen && s + 3 < end; i ++) { *s++ = hexchars[(databuf[i] >> 4) & 0xf]; *s++ = hexchars[databuf[i] & 0xf]; } *s = 0; return buf; } static char *pidflags2str(unsigned long flags, char *buf, int bufsiz) { char *s = buf; char *end = s + bufsiz; buf[0] = 0; if (flags & AVM_PA_PID_FLAG_NO_PID_CHANGED_CHECK) { snprintf(s, end-s, "%sno_pid_changed_check", s == buf ? "" : ","); s += strlen(s); } if (flags & AVM_PA_PID_FLAG_HSTART_ON_INGRESS) { snprintf(s, end-s, "%shstart_on_ingress", s == buf ? "" : ","); s += strlen(s); } if (flags & AVM_PA_PID_FLAG_HSTART_ON_EGRESS) { snprintf(s, end-s, "%shstart_on_egress", s == buf ? "" : ","); s += strlen(s); } if (s == buf) snprintf(s, end-s, "none"); return buf; } /* ------------------------------------------------------------------------ */ /* -------- l2tp session cache -------------------------------------------- */ /* ------------------------------------------------------------------------ */ static struct avm_pa_l2tp * pa_l2tp_session_search(__be32 session_id) { #ifdef CONFIG_L2TP struct avm_pa_data *pd = &pa_data; int i; for (i = 0; i < ARRAY_SIZE(pd->l2tp_cache); i++) { if (pd->l2tp_cache[i].session_id == session_id) return &pd->l2tp_cache[i]; } #endif return NULL; } static struct avm_pa_l2tp * pa_l2tp_session_search_by_peer(__be32 peer_session_id) { #ifdef CONFIG_L2TP struct avm_pa_data *pd = &pa_data; int i; for (i = 0; i < ARRAY_SIZE(pd->l2tp_cache); i++) { if (pd->l2tp_cache[i].peer_session_id == peer_session_id) return &pd->l2tp_cache[i]; } #endif return NULL; } #ifdef CONFIG_L2TP static struct l2tp_session * pa_l2tp_session_get_local(__be32 session_id) { if (in_irq()) return NULL; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) return l2tp_session_get(&init_net, ntohl(session_id)); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) || defined(AVM_L2TP_BACKPORT_4_15) return l2tp_session_get(&init_net, NULL, ntohl(session_id)); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 225) /* Instead of error prone ifdefs, we simply do not support kernels 4.5-4.8, * l2tp_session_get() is normally available since 4.9 onwards. 
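    * On kernels where l2tp_session_get() takes a reference, the caller must
    * drop it again via pa_l2tp_session_put_local(); the older
    * l2tp_session_find() path hands back an unreferenced pointer, so the
    * put becomes a no-op there (see below).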
*/ return l2tp_session_get(&init_net, NULL, ntohl(session_id), true); #else return l2tp_session_find(&init_net, NULL, ntohl(session_id)); #endif } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 225) #define pa_l2tp_session_put_local(s) l2tp_session_dec_refcount(s) #else /* no-op since this kernel uses l2tp_session_find() w/o refcounting */ #define pa_l2tp_session_put_local(s) #endif #endif static struct avm_pa_l2tp * pa_l2tp_session_alloc(__be32 session_id) { struct avm_pa_l2tp *l2tp = NULL; #ifdef CONFIG_L2TP struct avm_pa_data *pd = &pa_data; struct l2tp_session *local_sess; int i; local_sess = pa_l2tp_session_get_local(session_id); if (local_sess) { /* Add to the cache */ spin_lock(&avm_pa_lock); for (i = 0; i < ARRAY_SIZE(pd->l2tp_cache); i++) { if (pd->l2tp_cache[i].session_id == 0) { l2tp = &pd->l2tp_cache[i]; l2tp->session_id = session_id; /* store so that we can also look up by peer_session_id * for ingress packets */ l2tp->peer_session_id = htonl(local_sess->peer_session_id); l2tp->hdr_len = local_sess->hdr_len; break; } } spin_unlock(&avm_pa_lock); pa_l2tp_session_put_local(local_sess); } #endif return l2tp; } /* ------------------------------------------------------------------------ */ /* -------- parsing of packets -------------------------------------------- */ /* ------------------------------------------------------------------------ */ #define LISPDATAHDR(info) (HDRCOPY(info)+(info)->lisp_offset) static inline void pa_reset_match(struct avm_pa_pkt_match *info) { info->nmatch = 0; info->casttype = AVM_PA_IS_UNICAST; info->pkttype = AVM_PA_PKTTYPE_NONE; info->pppoe_offset = AVM_PA_OFFSET_NOT_SET; info->encap_offset = AVM_PA_OFFSET_NOT_SET; info->lisp_offset = AVM_PA_OFFSET_NOT_SET; info->ip_offset = AVM_PA_OFFSET_NOT_SET; info->l4_offset = AVM_PA_OFFSET_NOT_SET; info->hdroff = 0; info->hdrlen = 0; info->full_hdrlen = 0; info->vlan_tci = 0; info->vlan_proto = 0; } static inline struct avm_pa_match_info * pa_find_eth_match(struct avm_pa_pkt_match *match) { struct avm_pa_match_info *p, *end = &match->match[match->nmatch]; for (p = &match->match[0]; p != end; p++) { if (p->type == AVM_PA_ETH) { return p; } } return NULL; /* no ETH found */ } static inline void pa_change_to_bridge_match(struct avm_pa_pkt_match *match) { struct avm_pa_match_info *p; p = pa_find_eth_match(match); if (p) { if ((p + 1)->type == AVM_PA_VLAN) ++p; match->nmatch = p - match->match + 1; } } static inline int pa_add_match(struct avm_pa_pkt_match *info, unsigned char offset, unsigned char type) { if (info->nmatch < AVM_PA_MAX_MATCH) { info->match[info->nmatch].offset = offset; info->match[info->nmatch].type = type; info->nmatch++; return 0; } return -1; } static int set_pkt_match(enum avm_pa_framing framing, unsigned int hstart, PKT *pkt, struct avm_pa_pkt_match *info, int on_egress) { #define RETURN(retval) do { ret = retval; goto out; } while (0) int ret = AVM_PA_RX_ERROR_LEN; int state = 0; u8 *data, *p, *end; u32 daddr; u16 ethproto = 0; u16 ipproto = 0; int ttl = 0; int full_hdrlen = 0; data = PKT_DATA(pkt); end = data + PKT_LEN(pkt); data += hstart; switch (framing) { case avm_pa_framing_ip: if ((data[0] & 0xf0) == 0x40 && (data[0] & 0x0f) >= 5) { state = AVM_PA_IPV4; break; } if ((data[0] & 0xf0) == 0x60) { state = AVM_PA_IPV6; break; } return AVM_PA_RX_ERROR_IPVERSION; case avm_pa_framing_ppp: state = AVM_PA_PPP; break; case avm_pa_framing_ether: state = AVM_PA_ETH; break; case avm_pa_framing_dev: data = (u8 *)eth_hdr(pkt); state = AVM_PA_ETH; break; case avm_pa_framing_ipdev: case 
avm_pa_framing_ptype: data = (u8 *)skb_network_header(pkt); if (pkt->protocol == constant_htons(ETH_P_IP)) { state = AVM_PA_IPV4; } else if (pkt->protocol == constant_htons(ETH_P_IPV6)) { state = AVM_PA_IPV6; } else { return AVM_PA_RX_BYPASS; } break; case avm_pa_framing_llcsnap: state = AVM_PA_LLC_SNAP; break; } if (end - data > AVM_PA_MAX_HEADER - AVM_PA_MAX_HDROFF) end = data + AVM_PA_MAX_HEADER - AVM_PA_MAX_HDROFF; p = data; while (p < end) { hdrunion_t *hdr = (hdrunion_t *)p; int offset = p-data; switch (state) { case AVM_PA_ETH: if (pa_add_match(info, offset, AVM_PA_ETH) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct ethhdr); if (hdr->ethh.h_dest[0] & 1) { if (hdr->ethh.h_dest[0] == 0xff) { info->casttype = AVM_PA_IS_BROADCAST; RETURN(AVM_PA_RX_BYPASS); } else { info->casttype = AVM_PA_IS_MULTICAST; } } if (skb_vlan_tag_present(pkt)) { info->vlan_tci = skb_vlan_tag_get(pkt); info->vlan_proto = PA_VLAN_PROTO(pkt); if (pa_add_match(info, AVM_PA_OFFSET_NOT_SET, AVM_PA_VLAN) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); } state = AVM_PA_ETH_PROTO; ethproto = hdr->ethh.h_proto; continue; case AVM_PA_VLAN: /* This handles only in-band vlan */ if (pa_add_match(info, offset, AVM_PA_VLAN) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct vlanhdr); state = AVM_PA_ETH_PROTO; ethproto = hdr->vlanh.vlan_proto; continue; case AVM_PA_ETH_PROTO: switch (ethproto) { case constant_htons(ETH_P_PPP_SESS): state = AVM_PA_PPPOE; continue; case constant_htons(ETH_P_IP): state = AVM_PA_IPV4; continue; case constant_htons(ETH_P_IPV6): state = AVM_PA_IPV6; continue; case constant_htons(ETH_P_8021Q): case constant_htons(ETH_P_8021AD): state = AVM_PA_VLAN; continue; } RETURN(AVM_PA_RX_BYPASS); case AVM_PA_PPPOE: if (pa_add_match(info, offset, AVM_PA_PPPOE) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct pppoehdr); info->pppoe_offset = offset; state = AVM_PA_PPP; continue; case AVM_PA_PPP: if (p[0] == 0) { p++; offset++; } if (p[0] == 0x21) { if (pa_add_match(info, offset, AVM_PA_PPP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p++; state = AVM_PA_IPV4; continue; } if (p[0] == 0x57) { if (pa_add_match(info, offset, AVM_PA_PPP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p++; state = AVM_PA_IPV6; continue; } RETURN(AVM_PA_RX_BYPASS); case AVM_PA_IPV4: if (hdr->iph.version != 4) RETURN(AVM_PA_RX_ERROR_IPVERSION); if (pa_add_match(info, offset, AVM_PA_IPV4) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); ttl = hdr->iph.ttl; p += PA_IPHLEN(&hdr->iph); if (hdr->iph.frag_off & constant_htons(IP_OFFSET)) RETURN(AVM_PA_RX_FRAGMENT); /* We don't support forwarding fragments, we may only create them for * tunnels, so check if we're on egress. 
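    * (A non-zero fragment offset was already rejected above; only the IP_MF
    * bit of a first fragment is tolerated, and only on egress.)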
*/ if ((hdr->iph.frag_off & constant_htons(IP_MF)) && !on_egress) RETURN(AVM_PA_RX_FRAGMENT); daddr = get_unaligned(&hdr->iph.daddr); if (ipv4_is_lbcast(daddr)) { info->casttype = AVM_PA_IS_BROADCAST; RETURN(AVM_PA_RX_BYPASS); } else if (ipv4_is_multicast(daddr)) { info->casttype = AVM_PA_IS_MULTICAST; } if (hdr->iph.protocol == IPPROTO_IPV6) { if (info->pkttype != AVM_PA_PKTTYPE_NONE) RETURN(AVM_PA_RX_ERROR_HDR); info->pkttype |= AVM_PA_PKTTYPE_IPV4ENCAP; info->encap_offset = offset; state = AVM_PA_IPV6; continue; } if (hdr->iph.protocol == IPPROTO_IPENCAP) { if (info->pkttype != AVM_PA_PKTTYPE_NONE) RETURN(AVM_PA_RX_ERROR_HDR); info->pkttype |= AVM_PA_PKTTYPE_IPV4ENCAP; info->encap_offset = offset; state = AVM_PA_IPV4; continue; } info->pkttype |= AVM_PA_PKTTYPE_IPV4; info->ip_offset = offset; state = AVM_PA_IP_PROTO; ipproto = hdr->iph.protocol; if ((offset & 0x3) && info->hdroff == 0) info->hdroff = 4 - (offset & 0x3); continue; case AVM_PA_IPV6: if (hdr->ipv6h.version != 6) RETURN(AVM_PA_RX_ERROR_IPVERSION); if (pa_add_match(info, offset, AVM_PA_IPV6) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); ttl = hdr->ipv6h.hop_limit; p += sizeof(struct ipv6hdr); if (hdr->ipv6h.daddr.s6_addr[0] == 0xff) info->casttype = AVM_PA_IS_MULTICAST; if (hdr->ipv6h.nexthdr == IPPROTO_IPV6) { if (info->pkttype != AVM_PA_PKTTYPE_NONE) RETURN(AVM_PA_RX_ERROR_HDR); info->pkttype |= AVM_PA_PKTTYPE_IPV6ENCAP; info->encap_offset = offset; state = AVM_PA_IPV6; continue; } if (hdr->ipv6h.nexthdr == IPPROTO_IPENCAP) { if (info->pkttype != AVM_PA_PKTTYPE_NONE) RETURN(AVM_PA_RX_ERROR_HDR); info->pkttype |= AVM_PA_PKTTYPE_IPV6ENCAP; info->encap_offset = offset; state = AVM_PA_IPV4; continue; } if (hdr->ipv6h.nexthdr == IPPROTO_FRAGMENT) { struct ipv6fraghdr *fragh = (struct ipv6fraghdr *)p; info->pkttype |= AVM_PA_PKTTYPE_IPV6; info->ip_offset = offset; if (fragh->frag_off & constant_htons(IP6_OFFSET)) RETURN(AVM_PA_RX_FRAGMENT); /* We don't support forwarding fragments, we may only create them for * tunnels, so check if we're on egress. */ if ((fragh->frag_off & constant_htons(IP6_MF)) && !on_egress) RETURN(AVM_PA_RX_FRAGMENT); p += sizeof(struct ipv6fraghdr); state = AVM_PA_IP_PROTO; ipproto = fragh->nexthdr; continue; } info->pkttype |= AVM_PA_PKTTYPE_IPV6; info->ip_offset = offset; state = AVM_PA_IP_PROTO; ipproto = hdr->ipv6h.nexthdr; if ((offset & 0x3) && info->hdroff == 0) info->hdroff = 4 - (offset & 0x3); continue; case AVM_PA_IP_PROTO: switch (ipproto) { case IPPROTO_TCP: if (p + sizeof(struct tcphdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); if (pa_add_match(info, offset, AVM_PA_PORTS) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); /* Only ports are stored */ full_hdrlen = (p - data) + sizeof(struct tcphdr); p += 2 * sizeof(__be16); info->pkttype |= ipproto; info->l4_offset = offset; RETURN(AVM_PA_RX_OK); case IPPROTO_UDP: if (p + sizeof(struct udphdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); if (pa_add_match(info, offset, AVM_PA_PORTS) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); if (hdr->udph.dest == constant_htons(4341)) { p += sizeof(struct udphdr); state = AVM_PA_LISP; continue; } else if (hdr->udph.dest == constant_htons(67)) { if (AVM_PA_PKTTYPE_IP_VERSION(info->pkttype) == 4) { /* We don't accelerate DHCPv4 as it turned out to be * problematic. Local DHCP daemons must listen * on a raw socket in addition to datagram sockets * because clients usually have the source address * of 0.0.0.0. We cannot serve raw sockets once a local * session exists so the local daemons miss packets. 
* * DHCP is not worthwhile to accelerate anyway * - low traffic * - often broadcast * * Since no other protocol requires two sockets it's * not worth it to implement raw socket support, therefore * just don't create sessions. * * Fixes JZ-25001 and JZ-94510. */ p += sizeof(struct udphdr); RETURN(AVM_PA_RX_BYPASS); } } /* Only ports are stored */ full_hdrlen = (p - data) + sizeof(struct udphdr); p += 2 * sizeof(__be16); info->pkttype |= ipproto; info->l4_offset = offset; RETURN(AVM_PA_RX_OK); case IPPROTO_ICMP: if (p + sizeof(struct icmphdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); if ( hdr->icmph.type != ICMP_ECHO && hdr->icmph.type != ICMP_ECHOREPLY) RETURN(AVM_PA_RX_BYPASS); if (pa_add_match(info, offset, AVM_PA_ICMPV4) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct icmphdr); info->pkttype |= ipproto; info->l4_offset = offset; RETURN(AVM_PA_RX_OK); case IPPROTO_ICMPV6: if (p + sizeof(struct icmp6hdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); if ( hdr->icmpv6h.icmp6_type != ICMPV6_ECHO_REQUEST && hdr->icmpv6h.icmp6_type != ICMPV6_ECHO_REPLY) RETURN(AVM_PA_RX_BYPASS); if (pa_add_match(info, offset, AVM_PA_ICMPV6) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct icmp6hdr); info->pkttype |= ipproto; info->l4_offset = offset; RETURN(AVM_PA_RX_OK); case IPPROTO_L2TP: if (AVM_PA_PKTTYPE_IPENCAP_VERSION(info->pkttype)) RETURN(AVM_PA_RX_OK); if (pa_add_match(info, offset, AVM_PA_L2TP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); { struct avm_pa_l2tp *l2tp = NULL; __be32 be_session_id = hdr->l2tp.session_id; /* check the system has configured sessions... * yes: we check and use the sessions offset (start of eth header) * no: we terminate classification, probably l2tp pass through. */ if (be_session_id != 0) { if (on_egress) { l2tp = pa_l2tp_session_search_by_peer(be_session_id); } else { l2tp = pa_l2tp_session_search(be_session_id); if (!l2tp) l2tp = pa_l2tp_session_alloc(be_session_id); /* fails inside irq */ } } if (l2tp) { if (p + l2tp->hdr_len > end) RETURN(AVM_PA_RX_ERROR_LEN); p += l2tp->hdr_len; info->encap_offset = info->ip_offset; info->pkttype = AVM_PA_PKTTYPE_IP2IPENCAP_VERSION(info->pkttype); info->pkttype |= AVM_PA_PKTTYPE_L2TP; state = AVM_PA_ETH; } else { /* We are in irq context or cache is filled, * or there is no local l2tp session, i.e. pass through. * * We cannot know for sure as long as we might be in * irq context, but we assume pass through and figure * out later whether to add a session. * * Control connections are treated as pass through here * but effectively they won't be accelerated because precheck * on egress always fails (if they terminate locally). 
*/ AVM_PKT_INFO(pkt)->l2tp_session_id = be_session_id; if (p + sizeof(__be32) > end) RETURN(AVM_PA_RX_ERROR_LEN); p += sizeof(__be32); info->pkttype |= ipproto; info->l4_offset = offset; RETURN(AVM_PA_RX_OK); } } continue; case IPPROTO_GRE: if (AVM_PA_PKTTYPE_IPENCAP_VERSION(info->pkttype)) RETURN(AVM_PA_RX_OK); if (p + sizeof(struct tlb_grehdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); p += sizeof(struct tlb_grehdr); if (pa_add_match(info, offset, AVM_PA_GRE) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); info->encap_offset = info->ip_offset; info->pkttype = AVM_PA_PKTTYPE_IP2IPENCAP_VERSION(info->pkttype); info->pkttype |= AVM_PA_PKTTYPE_GRE; switch (hdr->greh.protocol) { case constant_htons(ETH_P_IP): state = AVM_PA_IPV4; continue; case constant_htons(ETH_P_TEB): state = AVM_PA_ETH; continue; } break; case IPPROTO_ESP: if (p + sizeof(struct ip_esp_hdr) > end) RETURN(AVM_PA_RX_ERROR_LEN); if (pa_add_match(info, offset, AVM_PA_ESP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct ip_esp_hdr); /* Only supporting pass-through... */ info->pkttype |= ipproto; info->l4_offset = offset; /* Encrypted payload follows, terminate parsing. */ RETURN(AVM_PA_RX_OK); } RETURN(AVM_PA_RX_BYPASS); case AVM_PA_LLC_SNAP: if ( hdr->llcsnap.dsap != 0xAA || hdr->llcsnap.ssap != 0xAA || hdr->llcsnap.ui != 0x03) /* not checking: * RFC1042_SNAP 0x00,0x00,0x00 * BTEP_SNAP 0x00,0x00,0xf8 */ RETURN(AVM_PA_RX_BYPASS); if (pa_add_match(info, offset, AVM_PA_LLC_SNAP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); p += sizeof(struct llc_snap_hdr); state = AVM_PA_ETH_PROTO; ethproto = get_unaligned(&hdr->llcsnap.type); continue; case AVM_PA_LISP: if (AVM_PA_PKTTYPE_IPENCAP_VERSION(info->pkttype)) RETURN(AVM_PA_RX_OK); if (p + LISP_DATAHDR_SIZE > end) RETURN(AVM_PA_RX_ERROR_LEN); p += LISP_DATAHDR_SIZE; hdr = (hdrunion_t *)p; if (hdr->iph.version == 4) state = AVM_PA_IPV4; else if (hdr->iph.version == 6) state = AVM_PA_IPV6; else RETURN(AVM_PA_RX_OK); /* not a lisp packet */ if (pa_add_match(info, offset, AVM_PA_LISP) < 0) RETURN(AVM_PA_RX_ERROR_MATCH); info->lisp_offset = offset; info->encap_offset = info->ip_offset; info->pkttype = AVM_PA_PKTTYPE_IP2IPENCAP_VERSION(info->pkttype); info->pkttype |= AVM_PA_PKTTYPE_LISP; continue; default: RETURN(AVM_PA_RX_ERROR_STATE); } } out: if (ret == AVM_PA_RX_OK && ttl == 0) ret = AVM_PA_RX_TTL; if (ret == AVM_PA_RX_OK && (p - data) > AVM_PA_MAX_HEADER) ret = AVM_PA_RX_ERROR_LEN; if (ret == AVM_PA_RX_OK || pa_glob.dbgmatch) { info->protocol = pkt->protocol; info->hdrlen = p - data; memcpy(HDRCOPY(info), data, info->hdrlen); } if (ret == AVM_PA_RX_OK) { info->full_hdrlen = full_hdrlen ? 
full_hdrlen : info->hdrlen; } return ret; #undef RETURN } static inline bool pa_match_is_tcp_nodata(struct avm_pa_pkt_match *match, char *head) { struct tcphdr *tcph = (struct tcphdr *) (head + match->l4_offset); struct iphdr *iph = (struct iphdr *) (head + match->ip_offset); if (AVM_PA_PKTTYPE_IP_VERSION(match->pkttype) == 4) return ntohs(PA_IPTOTLEN(iph)) == (PA_IPHLEN(iph)+PA_TCP_DOFF(tcph)); else if (AVM_PA_PKTTYPE_IP_VERSION(match->pkttype) == 6) return ntohs(PA_IP6_PAYLOADLEN(iph)) == PA_TCP_DOFF(tcph); else return 0; } static inline void pa_match_postprocess(struct avm_pa_pkt_match *info) { int i; info->hash = 0; for (i = 0 ; i < info->nmatch; i++) { struct avm_pa_match_info *p = &info->match[i]; hdrunion_t *hdr = (hdrunion_t *)(HDRCOPY(info)+p->offset); switch (p->type) { case AVM_PA_IPV4: #if AVM_PA_UNALIGNED_CHECK if (((unsigned long)&hdr->iph.saddr) & 0x3) if (net_ratelimit()) pr_info("avm_pa: unaligned access %p (ipv4)\n", &hdr->iph.saddr); #endif info->hash ^= hdr->iph.saddr; info->hash ^= hdr->iph.daddr; info->hash ^= hdr->iph.protocol; info->hash ^= hdr->iph.tos; /* * JZ-36233: Gastzugang auf dem Repeater * * A session may be created by a packet with IP_MF set. If this * header is going to be pushed as-is on egress (e.g. in case of L2TP * encap) all packets would have IP_MF set, so we need to reset frag_off. * * TODO: What about IPv6? */ hdr->iph.frag_off = 0; break; case AVM_PA_IPV6: #if AVM_PA_UNALIGNED_CHECK if (((unsigned long)&hdr->ipv6h.saddr.s6_addr32[2]) & 0x3) if (net_ratelimit()) pr_info("avm_pa: unaligned access %p (ipv6)\n", &hdr->ipv6h.saddr.s6_addr32[2]); #endif //info->hash ^= hdr->ipv6h.saddr.s6_addr32[0]; //info->hash ^= hdr->ipv6h.saddr.s6_addr32[1]; info->hash ^= hdr->ipv6h.saddr.s6_addr32[2]; info->hash ^= hdr->ipv6h.saddr.s6_addr32[3]; //info->hash ^= hdr->ipv6h.daddr.s6_addr32[0]; //info->hash ^= hdr->ipv6h.daddr.s6_addr32[1]; info->hash ^= hdr->ipv6h.daddr.s6_addr32[2]; info->hash ^= hdr->ipv6h.daddr.s6_addr32[3]; info->hash ^= hdr->ipv6h.nexthdr; /* hash prio and flow label (plus constant version 6) */ info->hash ^= hdr->ipv6_vpfl; break; case AVM_PA_PORTS: /* At least Linux seems to prefer even ports when selecting source ports, * for RPS we want the lowest bits of the hash to be most significant */ info->hash ^= ror16(hdr->ports[0], 1); info->hash ^= ror16(hdr->ports[1], 1); break; case AVM_PA_ICMPV4: case AVM_PA_ICMPV6: info->hash ^= hdr->ports[0]; /* type + code */ info->hash ^= hdr->ports[2]; /* id */ break; case AVM_PA_ESP: info->hash ^= hdr->esph.spi; break; } } info->hash = (info->hash >> 16) ^ (info->hash & 0xffff); info->hash = (info->hash >> 8) ^ (info->hash & 0xff); info->hash %= CONFIG_AVM_PA_MAX_SESSION; } static int pa_set_pkt_match(enum avm_pa_framing framing, unsigned int hstart, PKT *pkt, struct avm_pa_pkt_match *match, int on_egress) { int rc; pa_reset_match(match); rc = set_pkt_match(framing, hstart, pkt, match, on_egress); if (rc == AVM_PA_RX_OK) pa_match_postprocess(match); return rc; } /* Compare two packet matches. A slice can be selected by skipping * the first few match info items, for example to only compare the * L3 part of the packet match. */ static inline int pa_match_cmp(struct avm_pa_pkt_match *a1, int a1_skip, struct avm_pa_pkt_match *a2, int a2_skip) { struct avm_pa_match_info *p; hdrunion_t *h1, *h2; int rc; int i; int a1_nmatch = a1->nmatch - a1_skip; int a2_nmatch = a2->nmatch - a2_skip; /* The match item count must be equal. */ if ((rc = a1_nmatch - a2_nmatch)) goto out; /* The match slice itself must be equal. 
*/ if ((rc = memcmp(&a1->match[a1_skip], &a2->match[a2_skip], a1_nmatch*sizeof(struct avm_pa_match_info)))) goto out; /* From here now we determined that the slice is the same, therefore we only * use match items from a1 going forward, to test how the relevant fields in * the hdrcopy compare. */ for (i = a1->nmatch-1; i >= a1_skip; i--) { p = &a1->match[i]; /* h1 and h2 must NOT be used if p->offset is AVM_PA_OFFSET_NOT_SET. * At this time, AVM_PA_OFFSET_NOT_SET is only possible for type == AVM_PA_VLAN */ h1 = (hdrunion_t *)(HDRCOPY(a1)+p->offset); h2 = (hdrunion_t *)(HDRCOPY(a2)+p->offset); switch (p->type) { case AVM_PA_ETH: rc = memcmp(&h1->ethh, &h2->ethh, sizeof(struct ethhdr)); if (rc) goto out; break; case AVM_PA_VLAN: if (p->offset == AVM_PA_OFFSET_NOT_SET) rc = a1->vlan_tci ^ a2->vlan_tci; else rc = h1->vlanh.vlan_tci ^ h2->vlanh.vlan_tci; if (rc) goto out; break; case AVM_PA_PPPOE: rc = (int)h1->pppoeh.sid - (int)h2->pppoeh.sid; if (rc) goto out; break; case AVM_PA_PPP: rc = (int)h1->ppph[0] - (int)h2->ppph[0]; if (rc) goto out; break; case AVM_PA_IPV4: rc = (int)h1->iph.protocol - (int)h2->iph.protocol; if (rc) goto out; rc = (int)h1->iph.tos - (int)h2->iph.tos; if (rc) goto out; /* JZ-47728: Windows ICMP has always the same id so ttl * is the only difference between ping and tracert packets * Also, ttl == 1 must not match existing sessions with higher ttl. */ rc = (int)h1->iph.ttl - (int)h2->iph.ttl; if (rc) goto out; rc = (int)h1->iph.daddr - (int)h2->iph.daddr; if (rc) goto out; rc = (int)h1->iph.saddr - (int)h2->iph.saddr; if (rc) goto out; break; case AVM_PA_IPV6: /* compares priority and flow label in one op (version is always 6) */ rc = h1->ipv6_vpfl - h2->ipv6_vpfl; if (rc) goto out; rc = (int)h1->ipv6h.nexthdr - (int)h2->ipv6h.nexthdr; if (rc) goto out; rc = (int)h1->ipv6h.hop_limit - (int)h2->ipv6h.hop_limit; if (rc) goto out; /* compare both src and dst in a single call */ rc = memcmp(&h1->ipv6h.saddr, &h2->ipv6h.saddr, sizeof(struct in6_addr) * 2); if (rc) goto out; break; case AVM_PA_PORTS: rc = (int)h1->ports[0] - (int)h2->ports[0]; /* source */ if (rc) goto out; rc = (int)h1->ports[1] - (int)h2->ports[1]; /* dest */ if (rc) goto out; break; case AVM_PA_ICMPV4: case AVM_PA_ICMPV6: rc = (int)h1->ports[0] - (int)h2->ports[0]; /* type + code */ if (rc) goto out; rc = (int)h1->ports[2] - (int)h2->ports[2]; /* id */ if (rc) goto out; break; case AVM_PA_LLC_SNAP: rc = (int)h1->llcsnap.type - (int)h2->llcsnap.type; if (rc) goto out; break; case AVM_PA_L2TP: rc = (int)h1->l2tp.session_id - (int)h2->l2tp.session_id; if (rc) goto out; break; case AVM_PA_GRE: rc = (int)h1->greh.protocol - (int)h2->greh.protocol; if (rc) goto out; break; case AVM_PA_ESP: rc = (int)h1->esph.spi - (int)h2->esph.spi; if (rc) goto out; break; } } out: return rc; } static inline int pa_match_eq(struct avm_pa_pkt_match *a1, struct avm_pa_pkt_match *a2) { return pa_match_cmp(a1, 0, a2, 0) == 0; } /* Returns 1 if two matches are compatible for bridging. * * This is basically the same as pa_match_eq, except vlan is not considered, since * a bsession can cross VLANs (provided that no modifications need to be done * to the packet data and that the system's bridge setup allows that). 
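 * For example, two matches that differ only in an out-of-band (hw-accel) VLAN
 * tag can share a bsession, provided the Ethernet addresses and everything
 * above L2 (IP header, ports, ...) are byte-identical; an in-band VLAN header
 * in skb->data still has to match (see JZ-63724 below).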
*/ static inline int pa_match_bridged(struct avm_pa_pkt_match *a1, struct avm_pa_pkt_match *a2) { struct avm_pa_match_info *p1, *p2; hdrunion_t *h1, *h2; if (!(p1 = pa_find_eth_match(a1))) return 0; if (!(p2 = pa_find_eth_match(a2))) return 0; h1 = (hdrunion_t *)(HDRCOPY(a1)+p1->offset); h2 = (hdrunion_t *)(HDRCOPY(a2)+p2->offset); /* MAC addresses must be equal. */ if (memcmp(&h1->ethh, &h2->ethh, ETH_ALEN * 2)) return 0; /* Different VLANs is OK, even the VID may differ. So just skip the VLAN match */ /* JZ-63724: ...but only if there's no in-band VLAN header stored in skb->data */ if ((++p1)->type == AVM_PA_VLAN && p1->offset == AVM_PA_OFFSET_NOT_SET) ++p1; if ((++p2)->type == AVM_PA_VLAN && p2->offset == AVM_PA_OFFSET_NOT_SET) ++p2; /* Compare the remainder for equality which ensures that modifications * to the packet data are not permitted. */ return pa_match_cmp(a1, p1 - a1->match, a2, p2 - a2->match) == 0; } /* ------------------------------------------------------------------------ */ /* -------- mod rec ------------------------------------------------------- */ /* ------------------------------------------------------------------------ */ /* * From RFC 1624 Incremental Internet Checksum * * HC - old checksum in header * HC' - new checksum in header * m - old value of a 16-bit field * m' - new value of a 16-bit field * HC' = ~(~HC + ~m + m') -- [Eqn. 3] * HC' = HC - ~m - m' -- [Eqn. 4] * * * csum_unfold(): be16 -> u32 * * M = ~m + m'; * * we use Eqn.3, because we precalculate M. * csum_fold(): add the carries * * HC' = ~csum_fold((~csum_unfold(HC) + ~m + m')); * * HC' = ~csum_fold(csum_add(~csum_unfold(HC), M); * */ static inline u32 hcsum_add(u32 sum, u32 addend) { sum += addend; if (sum < addend) sum++; /* skip -0 */ return sum; // + (sum < addend); } static inline u32 hcsum_prepare(u16 sum) { return (u16)(~sum); } static inline u32 hcsum_u32(u32 sum, u32 from, u32 to) { sum = hcsum_add(sum, ~from); sum = hcsum_add(sum, to); return sum; } static inline u32 hcsum_u16(u32 sum, u16 from, u16 to) { sum = hcsum_u32(sum, from, to); return sum; } static inline u16 hcsum_fold(u32 sum) { while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); return sum; } static inline u16 hcsum_finish(u32 sum) { return ~hcsum_fold(sum); } static int pa_set_v4_mod_rec(struct avm_pa_v4_mod_rec *mod, int update_ttl, u8 *in, u8 *out) { struct iphdr *iiph = (struct iphdr *)in; struct iphdr *oiph = (struct iphdr *)out; u32 l3_check = 0; u32 l4_check; int isicmp = 0; u16 modflags = 0; mod->saddr = oiph->saddr; if (iiph->saddr != oiph->saddr) { modflags |= AVM_PA_MOD_SADDR|AVM_PA_MOD_IP4_CSUM; l3_check = hcsum_u32(l3_check, iiph->saddr, oiph->saddr); } mod->daddr = oiph->daddr; if (iiph->daddr != oiph->daddr) { modflags |= AVM_PA_MOD_DADDR|AVM_PA_MOD_IP4_CSUM; l3_check = hcsum_u32(l3_check, iiph->daddr, oiph->daddr); } l4_check = l3_check; mod->tos = oiph->tos; if (iiph->tos != oiph->tos) { modflags |= AVM_PA_MOD_TOS|AVM_PA_MOD_IP4_CSUM; l3_check = hcsum_u16(l3_check, htons(iiph->tos), htons(oiph->tos)); } if (update_ttl) { modflags |= AVM_PA_MOD_TTL|AVM_PA_MOD_IP4_CSUM; l3_check = hcsum_u16(l3_check, constant_htons(0x0100), 0x0000); } mod->l3crc_update = hcsum_fold(l3_check); switch (iiph->protocol) { case IPPROTO_TCP: mod->l4crc_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: mod->l4crc_offset = offsetof(struct udphdr, check); break; case IPPROTO_ICMP: #ifdef _LINUX_ICMP_H mod->l4crc_offset = offsetof(struct icmphdr, checksum); #else mod->l4crc_offset = offsetof(struct icmphdr, check); #endif 
isicmp = 1; break; default: mod->l4crc_offset = 0; break; } mod->l4crc_update = 0; mod->l4crc_update_part = 0; if (mod->l4crc_offset) { u16 *iports = (u16 *)(in + PA_IPHLEN(iiph)); u16 *oports = (u16 *)(out + PA_IPHLEN(oiph)); if (isicmp) { l4_check = 0; mod->id = oports[2]; if (iports[2] != oports[2]) { modflags |= AVM_PA_MOD_ICMPID|AVM_PA_MOD_L4_CSUM; l4_check = hcsum_u16(l4_check, iports[2], oports[2]); } } else { if (modflags & AVM_PA_MOD_ADDRS) modflags |= AVM_PA_MOD_L4_CSUM; mod->sport = oports[0]; /* PARTIAL_CSUM case: tcph->check is prepared with IP addrs (pre-NAT), * checksum is will be computed over tcp (header + payload) so ports * must not be included in the update mask. */ mod->l4crc_update_part = hcsum_fold(l4_check); if (iports[0] != oports[0]) { modflags |= AVM_PA_MOD_SPORT|AVM_PA_MOD_L4_CSUM; l4_check = hcsum_u16(l4_check, iports[0], oports[0]); } mod->dport = oports[1]; if (iports[1] != oports[1]) { modflags |= AVM_PA_MOD_DPORT|AVM_PA_MOD_L4_CSUM; l4_check = hcsum_u16(l4_check, iports[1], oports[1]); } } mod->l4crc_update = hcsum_fold(l4_check); } mod->iphlen = PA_IPHLEN(oiph); return modflags; } static void pa_do_v4_mod_rec(struct avm_pa_v4_mod_rec *mod, u16 modflags, bool partial_csum, u8 *data) { struct avm_pa_global *ctx = &pa_glob; struct iphdr *iph = (struct iphdr *)data; u16 *ports = (u16 *)(data + mod->iphlen); u32 sum; u16 csum; ctx->stats.rx_mod++; if (modflags & AVM_PA_MOD_ADDRS) { if (((unsigned long)iph) & 0x3) { memcpy(&iph->saddr, &mod->saddr, 2*sizeof(u32)); } else { iph->saddr = mod->saddr; iph->daddr = mod->daddr; } } if (modflags & AVM_PA_MOD_TOS) iph->tos = mod->tos; if (modflags & AVM_PA_MOD_TTL) iph->ttl--; if (modflags & AVM_PA_MOD_IP4_CSUM) { sum = hcsum_prepare(iph->check); iph->check = hcsum_finish(hcsum_add(sum, mod->l3crc_update)); } if (modflags & AVM_PA_MOD_PORTS) { ports[0] = mod->sport; ports[1] = mod->dport; } else if (modflags & AVM_PA_MOD_ICMPID) { ports[2] = mod->id; } if (modflags & AVM_PA_MOD_L4_CSUM) { csum = ports[mod->l4crc_offset>>1]; if (csum || iph->protocol != IPPROTO_UDP) { if (partial_csum) { ports[mod->l4crc_offset>>1] = hcsum_fold(hcsum_add(csum, mod->l4crc_update_part)); } else { sum = hcsum_prepare(csum); ports[mod->l4crc_offset>>1] = hcsum_finish(hcsum_add(sum, mod->l4crc_update)); } } } } static void pa_show_v4_mod_rec(struct avm_pa_v4_mod_rec *mod, u16 modflags, pa_fprintf fprintffunc, void *arg) { char buf[64]; if (modflags & AVM_PA_MOD_SADDR) { in_addr2str(&mod->saddr, buf, sizeof(buf)); (*fprintffunc)(arg, "*IPv4 Src : %s\n", buf); } if (modflags & AVM_PA_MOD_DADDR) { in_addr2str(&mod->daddr, buf, sizeof(buf)); (*fprintffunc)(arg, "*IPv4 Dst : %s\n", buf); } if (modflags & AVM_PA_MOD_TOS) (*fprintffunc)(arg, "*IPv4 Tos : 0x%02x\n", mod->tos); if (modflags & AVM_PA_MOD_IP4_CSUM) (*fprintffunc)(arg, "*L3 Sum : update 0x%02x\n", mod->l3crc_update); if (modflags & AVM_PA_MOD_SPORT) (*fprintffunc)(arg, "*Src Port : %d\n", ntohs(mod->sport)); if (modflags & AVM_PA_MOD_DPORT) (*fprintffunc)(arg, "*Dst Port : %d\n", ntohs(mod->dport)); if (modflags & AVM_PA_MOD_ICMPID) (*fprintffunc)(arg, "*ICMP Id : %d\n", ntohs(mod->id)); if (modflags & AVM_PA_MOD_L4_CSUM) (*fprintffunc)(arg, "*L4 Sum : update 0x%02x\n", mod->l4crc_update); } /* ------------------------------------------------------------------------ */ static void pa_show_mod_rec(struct avm_pa_mod_rec *mod, pa_fprintf fprintffunc, void *arg) { (*fprintffunc)(arg, "IP version : %u\n", mod->ipversion); if (mod->pull_l2_len) (*fprintffunc)(arg, "L2 pull : %d\n", 
mod->pull_l2_len); if (mod->pull_encap_len) (*fprintffunc)(arg, "Encap pull : %d\n", mod->pull_encap_len); if (mod->push_encap_len) (*fprintffunc)(arg, "Push IPv : %u\n", mod->outer_ipversion); if (mod->push_udpoffset) (*fprintffunc)(arg, "Push UDP : %u\n", mod->push_udpoffset); if (mod->push_encap_len) { char buf[256]; data2hex(HDRCOPY(mod)+mod->push_l2_len, mod->push_encap_len, buf, sizeof(buf)); (*fprintffunc)(arg, "Encap push : %s\n", buf); } pa_show_v4_mod_rec(&mod->v4_mod, mod->modflags, fprintffunc, arg); if (mod->modflags & AVM_PA_MOD_TTL) (*fprintffunc)(arg, "*IPv%d TTL : decrease\n", mod->ipversion); } static int pa_egress_precheck(struct avm_pa_pid *pid, PKT *pkt, struct avm_pa_pkt_match *ingress, struct avm_pa_pkt_match *egress) { unsigned int hstart; int ret; if (pid->ecfg.flags & AVM_PA_PID_FLAG_HSTART_ON_EGRESS) hstart = AVM_PKT_INFO(pkt)->hstart; else hstart = 0; ret = pa_set_pkt_match(pid->egress_framing, hstart, pkt, egress, 1); if (ret != AVM_PA_RX_OK) return ret; if (!AVM_PA_PKTTYPE_BASE_EQ(egress->pkttype, ingress->pkttype)) return AVM_PA_RX_BYPASS; return AVM_PA_RX_OK; } static void pa_calc_modify(struct avm_pa_session *session, struct avm_pa_pkt_match *ingress, struct avm_pa_pkt_match *egress) { /* * Precondition: AVM_PA_PKTTYPE_BASE_EQ(egress->pkttype, ingress->pkttype) */ struct avm_pa_mod_rec *mod = &session->mod; mod->hdroff = egress->hdroff; memcpy(HDRCOPY(mod), HDRCOPY(egress), egress->hdrlen); mod->pkttype = egress->pkttype; if (AVM_PA_PKTTYPE_EQ(ingress->pkttype, egress->pkttype)) { if (ingress->encap_offset == AVM_PA_OFFSET_NOT_SET) { /* no tunnel, egress->encap_offset also not set */ mod->pull_l2_len = ingress->ip_offset; mod->pull_encap_len = 0; mod->ipversion = AVM_PA_PKTTYPE_IP_VERSION(egress->pkttype); mod->outer_ipversion = mod->ipversion; mod->push_encap_len = 0; mod->push_l2_len = egress->ip_offset; } else { /* untouched tunnel, egress->encap_offset also set */ mod->pull_l2_len = ingress->encap_offset; mod->pull_encap_len = 0; mod->ipversion = AVM_PA_PKTTYPE_IPENCAP_VERSION(egress->pkttype); mod->outer_ipversion = mod->ipversion; mod->push_encap_len = 0; mod->push_l2_len = egress->encap_offset; } } else { /* AVM_PA_PKTTYPE_BASE_EQ because of precheck */ BUG_ON(!ingress->encap_offset && !egress->encap_offset); if (ingress->encap_offset == AVM_PA_OFFSET_NOT_SET) { /* tunnel header only on egress */ mod->pull_l2_len = ingress->ip_offset; mod->pull_encap_len = 0; mod->ipversion = AVM_PA_PKTTYPE_IP_VERSION(egress->pkttype); mod->outer_ipversion = AVM_PA_PKTTYPE_IPENCAP_VERSION(egress->pkttype); mod->push_encap_len = egress->ip_offset - egress->encap_offset; mod->push_l2_len = egress->encap_offset; } else if (egress->encap_offset == AVM_PA_OFFSET_NOT_SET) { /* tunnel header only on ingress */ mod->pull_l2_len = ingress->encap_offset; mod->pull_encap_len = ingress->ip_offset - ingress->encap_offset; mod->ipversion = AVM_PA_PKTTYPE_IP_VERSION(egress->pkttype); mod->outer_ipversion = mod->ipversion; mod->push_encap_len = 0; mod->push_l2_len = egress->ip_offset; } else { /* different tunnel header on both ingress and egress (!AVM_PA_PKTTYPE_EQ) */ mod->pull_l2_len = ingress->encap_offset; mod->pull_encap_len = ingress->ip_offset - ingress->encap_offset; mod->ipversion = AVM_PA_PKTTYPE_IP_VERSION(egress->pkttype); mod->outer_ipversion = AVM_PA_PKTTYPE_IPENCAP_VERSION(egress->pkttype); mod->push_encap_len = egress->ip_offset - egress->encap_offset; mod->push_l2_len = egress->encap_offset; } } if (mod->push_encap_len) { if (egress->lisp_offset != 
AVM_PA_OFFSET_NOT_SET) { mod->push_udpoffset = egress->lisp_offset - egress->encap_offset; mod->push_udpoffset -= sizeof(struct udphdr); } } else { mod->push_udpoffset = 0; } if (mod->ipversion == 4) { int ingress_offset = mod->pull_l2_len + mod->pull_encap_len; int egress_offset = mod->push_l2_len + mod->push_encap_len; mod->modflags = pa_set_v4_mod_rec(&mod->v4_mod, test_bit(PA_S_ROUTED, &session->flags), HDRCOPY(ingress)+ingress_offset, HDRCOPY(mod)+egress_offset); } else if (mod->ipversion == 6) { if (test_bit(PA_S_ROUTED, &session->flags)) { mod->modflags = AVM_PA_MOD_TTL; } } else { BUG(); } } static u8 casttype2pkt_type[] = { PACKET_HOST, PACKET_MULTICAST, PACKET_BROADCAST }; /* ------------------------------------------------------------------------ */ /* -------- session retrieval and verification ---------------------------- */ /* ------------------------------------------------------------------------ */ static struct avm_pa_session * pa_session_get(avm_session_handle session_handle) { struct avm_pa_data *pd = &pa_data; struct avm_pa_session *session; session = PA_SESSION(pd, session_handle); if (!avm_pa_session_valid(session)) session = NULL; return session; } /* ------------------------------------------------------------------------ */ /* -------- packet forwarding --------------------------------------------- */ /* ------------------------------------------------------------------------ */ #ifdef CONFIG_AVM_PA_TX_NAPI static int pa_dev_tx_napi_poll(struct napi_struct *napi, int budget) { int done; struct avm_pa_pid *pid = container_of(napi, struct avm_pa_pid, tx_napi); for (done = 0; done < budget; done++) { PKT *pkt = skb_dequeue_tail(&pid->tx_napi_pkts); if (!pkt) break; pid->cfg.tx_func(pid->cfg.tx_arg, pkt); } if (done < budget) napi_complete(napi); return done; } #ifdef CONFIG_SMP static void __do_schedule_napi(struct napi_struct *napi) { int cpu = smp_processor_id(); int tcpu = cpumask_any_but(cpu_online_mask, cpu); if (tcpu >= nr_cpumask_bits) tcpu = cpu; /* This runs in a tasklet because we want to run the "core transition" per * packet burst, and not per packet. Both napi_schedule_prep() and IPIs (via * smp_call_function_single()) on a per packet basis would be too expensive in this * smp scenario. (napi_schedule_prep() does atomic accesses which requires snooping * the other cores caches, and the napi_poll runs one of the other cores). * * Furthermore, guarding the IPI with napi_schedule_prep() has been found to * perform a bit better than doing the IPI straight in this tasklet. 
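    * If no other CPU is online, tcpu falls back to the local CPU and the
    * "IPI" simply schedules the NAPI poll on this core.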
*/ if (napi_schedule_prep(napi)) smp_call_function_single(tcpu, (void*)__napi_schedule, napi, 0); } static void do_schedule_napi(struct avm_pa_pid *pid) { tasklet_schedule(&pid->tx_napi_tsk); } #else static void do_schedule_napi(struct avm_pa_pid *pid) { /* On UP the atomic access is a no-op */ napi_schedule(&pid->tx_napi); } #endif #endif static inline void pa_do_push_l2(struct avm_pa_egress *egress, PKT *pkt) { if (egress->push_l2_len) { memcpy(PKT_PUSH(pkt, egress->push_l2_len), HDRCOPY(&egress->match), egress->push_l2_len); if (egress->pppoe_offset != AVM_PA_OFFSET_NOT_SET) { unsigned char *data = PKT_DATA(pkt) + egress->pppoe_offset; struct pppoehdr *pppoehdr = (struct pppoehdr *)data; pppoehdr->length = htons(pkt->len - egress->pppoe_hdrlen); } } } static int _pa_transmit(struct avm_pa_egress *egress, PKT *pkt) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, egress->pid_handle); struct avm_pa_pkt_info *info = AVM_PKT_INFO(pkt); struct avm_pa_session *session __maybe_unused; #ifdef CONFIG_AVM_PA_TX_NAPI /* A non-NULL dev indicates avm_pa_dev_pid_register_tx_napi() was used */ if (pid->tx_napi.dev && (skb_queue_len(&pid->tx_napi_pkts) >= TX_NAPI_MAXQUEUE)) { PKT_FREE(pkt); /* drop packet, wouldn't fit anyway */ return NET_XMIT_DROP; } #endif /* * info->already_modified is set when paket comes * from avm_pa_tx_channel_accelerated_packet() and * statistics are counted in HW. * * 2016-03-02, calle */ if (info->already_modified == 0) { egress->sw_stats.tx_pkts++; egress->sw_stats.tx_bytes += pkt->len + egress->push_l2_len; } info->egress_pid_handle = egress->pid_handle; egress->tx_pkts++; pid->tx_pkts++; if (pid->ecfg.cb_len) memcpy(&pkt->cb[pid->ecfg.cb_start], egress->cb, pid->ecfg.cb_len); pkt->protocol = egress->match.protocol; switch (egress->type) { case avm_pa_egresstype_output: pa_do_push_l2(egress, pkt); pkt->tc_index = egress->output.tc_index; pkt->skb_iif = egress->output.skb_iif; pkt->mac_len = egress->output.mac_len; if (egress->match.vlan_proto) skb_vlan_tag_put(pkt, egress->match.vlan_proto, egress->match.vlan_tci); /* skb_flow_dissect() expects network_header to point * at the header indicated by skb->protocol */ if (pid->egress_framing == avm_pa_framing_ether) skb_set_network_header(pkt, ETH_HLEN); else skb_reset_network_header(pkt); pkt->pkt_type = PACKET_OUTGOING; /* Checksum handling: * * tl;dr: CHECKSUM_NONE unless checksum offload is requested by * CHECKSUM_PARTIAL. * * 1) Don't touch if already set to CHECKSUM_PARTIAL. * Either the packet is locally generated and checksum offloading is * is requested (especially in case of gso), or the packet is * "received" on some virtual device (e.g. ifb0) and the checksum offload * request is sticky, or the receive side (device or Linux) * performned GRO and segmentation must be done by the PID, including * checksum calculation. In all these events, keep CHECKSUM_PARTIAL. * 2) For normally received packets, ip_summed is assumed to be initialized * by the driver. If it verified the packet checksums or not doesn't * really matter, we blindly forward the packet. That means we must change * CHECKSUM_UNNECESSARY and CHECKSUM_COMPLETE to CHECKSUM_NONE. 
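 * In short, the statement below reduces this to:
 *
 *   ip_summed == CHECKSUM_PARTIAL  -> keep (offload/GSO still outstanding)
 *   anything else                  -> CHECKSUM_NONE (we forward blindly)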
*/ if (pkt->ip_summed != CHECKSUM_PARTIAL) pkt->ip_summed = CHECKSUM_NONE; skb_reset_mac_header(pkt); skb_set_queue_mapping(pkt, egress->output.txq_id); if (info->tcp_nodata) { pid->prioack_accl_acks++; egress->tcpack_pkts++; } pkt->priority = egress->output.priority; if (egress->output.dst) skb_dst_set(pkt, dst_clone(egress->output.dst)); secpath_reset(pkt); #ifdef CONFIG_BLOG if (avm_pa_pid_tack_enabled(pid)) { pkt->mark = SKBMARK_SET_Q(pkt->mark, (BROADCOM_MAX_PRIOS - (pkt->priority & TC_H_MIN_MASK))); pkt->mark = SKBMARK_SET_FLOW_ID(pkt->mark, 0x1); } /* * We have to let the flow cache know about the struct nf_conn * * entry of the connection, so flow cache registers an accelerated session * there. */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (!ctx->hw_ppa_disabled && (session = pa_session_get(info->session_handle)) && session->ct) { /* In case the skb already has a conntrack entry: * decrement refcount and overwrite it. */ if (skb_nfct(pkt)) { nf_conntrack_put(skb_nfct(pkt)); pkt->_nfct = 0; if (net_ratelimit()) pr_err("avm_pa: accelerated packet with exisitng nf_conn session_handle = %u\n", info->session_handle); } /* The nf_conn entry saved in the session contains the required generic_ct */ nf_conntrack_get(&session->ct->ct_general); nf_ct_set(pkt, session->ct, 0); /* * here we save the nf_conn entry in the blog extension of the skb * nothing is done if the blog_ptr(pkt) == NULL, this happens if we call blog_skip() */ blog_link(FLOWTRACK, blog_ptr(pkt), (void*) session->ct, session->generic_ct_dir ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL, 0 ); } #endif #endif #ifdef CONFIG_TI_PACKET_PROCESSOR /* * Relevant PP fields must be copied into the egress to ensure the PP handles * the packet correctly as if it had taken the entire slow path (via ARM). * * In JZ-68647 (Puma 7: Cert-Fail SF-02 proc-1.1, root cause), it was found * that we copied to little and added skb->ti_meta_info* to the list. In * JZ-69391 it was found that we copied too much and overwrite important * per-packet PP information and went back to a white list of individual fields. * * Reasoning: We don't need to store session information as the PP * session is already set up (or no session at all). We need to store QoS / SF * relevant fields that are used in the xmit routines of the interface drivers, * even if there is no PP session at all. Except ti_epi_header which contains * per-packet data set by the PP. 
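 * The resulting whitelist is exactly what is copied below:
 * puma_pktinfo.egress_queue, plus ti_meta_info/ti_meta_info2 when
 * CONFIG_TI_META_DATA is set. Session state and ti_epi_header are
 * intentionally not touched.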
*/ SKB_GET_PP_INFO_P(pkt)->egress_queue = egress->output.puma_pktinfo.egress_queue; #ifdef CONFIG_TI_META_DATA pkt->ti_meta_info = egress->output.ti_meta_info; pkt->ti_meta_info2 = egress->output.ti_meta_info2; #endif #endif #if AVM_PA_TRACE if (ctx->dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - _pa_transmit(%s), prio=0x%X, info->match.ack_only=%d\n", pkt_uniq_id(pkt), pid->cfg.name, pkt->priority, info->tcp_nodata); #endif #ifdef CONFIG_AVM_PA_TX_NAPI if (pid->tx_napi.dev) { skb_queue_tail(&pid->tx_napi_pkts, pkt); do_schedule_napi(pid); } else #endif { (*pid->cfg.tx_func)(pid->cfg.tx_arg, pkt); ctx->stats.fw_output++; } return NET_XMIT_SUCCESS; case avm_pa_egresstype_local: { pkt->pkt_type = casttype2pkt_type[egress->match.casttype]; skb_dst_set(pkt, dst_clone(egress->local.dst)); secpath_reset(pkt); pkt->dev = egress->local.dev; pkt->skb_iif = egress->local.skb_iif; ctx->stats.fw_local++; (*pid->cfg.tx_func)(pid->cfg.tx_arg, pkt); } return NET_XMIT_SUCCESS; case avm_pa_egresstype_rtp: if (egress->rtp.sk) { size_t hsize; skb_set_network_header(pkt, 0); if (pkt->protocol == constant_htons(ETH_P_IP)) { struct iphdr *iph = (struct iphdr *)pkt->data; hsize = iph->ihl*4; } else { hsize = sizeof(struct ipv6hdr); } skb_pull(pkt, hsize); /* skb->data points to udphdr */ skb_set_transport_header(pkt, 0); pkt->pkt_type = casttype2pkt_type[egress->match.casttype]; pkt->skb_iif = egress->rtp.skb_iif; ctx->stats.fw_rtp++; (*egress->rtp.transmit)(egress->rtp.sk, pkt); return NET_XMIT_SUCCESS; } else { ctx->stats.fw_rtp_drop++; kfree_skb(pkt); return NET_XMIT_SUCCESS; } case avm_pa_egresstype_xfrm: if (IS_ENABLED(CONFIG_XFRM)) { pkt->dev = egress->xfrm.dev; skb_dst_set(pkt, dst_clone(egress->xfrm.dst)); secpath_reset(pkt); pkt->tc_index = egress->xfrm.tc_index; (*pid->cfg.tx_func)(egress->xfrm.x, pkt); } return NET_XMIT_SUCCESS; case avm_pa_egresstype_null: /* perhaps we should shortcut and drop even earlier */ consume_skb(pkt); return NET_XMIT_SUCCESS; } ctx->stats.fw_ill++; kfree_skb(pkt); return NET_XMIT_SUCCESS; } static void pa_transmit(struct avm_pa_egress *egress, struct sk_buff *skb, int bridged) { struct avm_pa_global *ctx = &pa_glob; struct sk_buff *nskb; avm_simple_profiling_skb(0, skb); /* * Bugfix: bridge packets were cut, when third position of * mac address was 0x00, because ethernet header * was used as IP/IPv6 header, and packets were * trimed and perhaps fragmented. * * packets for bridge sessions arrive with ethernet header, * we do not need fragmentation or size check here. * * 2014-07-08 calle */ if (bridged == 0) { skb = avm_pa_fragment(egress, skb); if (!skb) { ctx->stats.fw_frag_fail++; return; } if (skb->next) ctx->stats.fw_frags++; } do { nskb = skb->next; skb->next = NULL; if (_pa_transmit(egress, skb) == NET_XMIT_DROP) ctx->stats.fw_drop++; else ctx->stats.fw_pkts++; } while ((skb = nskb)); } static void pa_do_modify_l3(struct avm_pa_mod_rec *mod, PKT *pkt) { if (mod->ipversion == 4) { pa_do_v4_mod_rec(&mod->v4_mod, mod->modflags, pkt->ip_summed == CHECKSUM_PARTIAL, PKT_DATA(pkt)); } else if (mod->modflags & AVM_PA_MOD_TTL) { struct ipv6hdr *ipv6h = (struct ipv6hdr *)PKT_DATA(pkt); ipv6h->hop_limit--; } } static void pa_do_modify_non_l2(struct avm_pa_mod_rec *mod, PKT *pkt, int bridged) { /* The actual vlan_tci will be inserted on egress. */ skb_vlan_tag_clear(pkt); if (bridged) { /* We have to initialize skb->network_header for Linux' transmit paths. * For bridged we can safely assume ethernet (might be vlan tagged, * but that's OK as long as pkt->protocol agrees). 
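 * Note that skb_vlan_tag_clear() above only drops the out-of-band tag; it is
 * re-inserted per egress in _pa_transmit() via skb_vlan_tag_put(), while an
 * in-band tag is simply left in the frame.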
*/ skb_set_network_header(pkt, ETH_HLEN); skb_reset_mac_len(pkt); return; } if (mod->pull_l2_len) PKT_PULL(pkt, mod->pull_l2_len); if (mod->pull_encap_len) PKT_PULL(pkt, mod->pull_encap_len); /* We're now at the innermost l3 header, set offsets in the skb appropriately. * This is required for Linux' transmit paths and some drivers (but remember that * this is not done for bridged sessions). * Also remove any padding that might still be left from ingress L2. */ skb_reset_network_header(pkt); if (mod->ipversion == 4) { pskb_trim(pkt, ntohs(PA_IPTOTLEN(pkt->data))); skb_set_transport_header(pkt, mod->v4_mod.iphlen); } else if (mod->ipversion == 6) { pskb_trim(pkt, ntohs(PA_IP6_PAYLOADLEN(pkt->data)) + sizeof(struct ipv6hdr)); skb_set_transport_header(pkt, sizeof(struct ipv6hdr)); } if (mod->modflags) pa_do_modify_l3(mod, pkt); if (mod->push_encap_len) { unsigned tot_len; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) /* Checksum offloading might get it wrong if we don't indicate encapsulation */ pkt->encapsulation = 1; skb_reset_inner_network_header(pkt); skb_set_inner_transport_header(pkt, skb_transport_offset(pkt)); #endif memcpy(PKT_PUSH(pkt, mod->push_encap_len), HDRCOPY(mod)+mod->push_l2_len, mod->push_encap_len); skb_reset_network_header(pkt); tot_len = pkt->len; if (mod->outer_ipversion == 4) { struct iphdr *iph = (struct iphdr *)PKT_DATA(pkt); iph->id = rand() & 0xffff; iph->tot_len = htons(tot_len); skb_set_transport_header(pkt, PA_IPHLEN(iph)); ip_send_check(iph); } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)PKT_DATA(pkt); ipv6h->payload_len = htons(tot_len - sizeof(struct ipv6hdr)); skb_set_transport_header(pkt, sizeof(*ipv6h)); } if (mod->push_udpoffset) { struct udphdr *udph = (struct udphdr *)(PKT_DATA(pkt)+mod->push_udpoffset); udph->len = htons(tot_len - mod->push_udpoffset); if (mod->outer_ipversion == 4) set_udp_checksum((struct iphdr *)PKT_DATA(pkt), udph); else set_udpv6_checksum((struct ipv6hdr *)PKT_DATA(pkt), udph); } } } static void _pa_do_send_egress(struct avm_pa_session *session, struct sk_buff *skb, int bridged) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_egress *egress, *first; struct sk_buff *nskb; AVM_PKT_INFO(skb)->is_accelerated = 1; egress = first = avm_pa_first_egress(session); /* We can transmit to the egress in any order as long as the skbs per egress are * in order. This is optimized to avoid a copy in the common, single egress case. */ hlist_for_each_entry_continue_rcu(egress, egress_list) { if ((nskb = PKT_COPY(skb)) != 0) pa_transmit(egress, nskb, bridged); else ctx->stats.fw_fail++; } pa_transmit(first, skb, bridged); } /* Pass NULL for session to to get it from the packet. Do this if there is uncertainty if * the session is still valid, i.e. if the packet was queued and the the RCU read side * critical section was left. * * If the session is given, we're still inside the RCU lock of avm_pa_pid_receive(). * Otherwise the caller must enter a new RCU read side. */ static void pa_do_modify_and_send(struct avm_pa_session *session, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct sk_buff *nskb; int nfrags, bridged; avm_simple_profiling_skb(0, skb); if (!session) { /* Protect against possible race with GC timer deleting sessions */ session = pa_session_get(AVM_PKT_INFO(skb)->session_handle); if (unlikely(!session)) { ctx->stats.fw_drop_gone++; PKT_FREE(skb); return; } } BUG_ON(AVM_PKT_INFO(skb)->session_uniq_id != session->uniq_id); /* TODO: Paged skbs but do they really correlate to individual packets? 
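 * For now nfrags below counts the head skb plus every skb on its frag_list
 * (skb_walk_frags) and is only used to bump ingress_sw_stats.tx_pkts, so an
 * aggregate (e.g. from GRO) is roughly accounted as multiple packets.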
*/ nfrags = 1; skb_walk_frags(skb, nskb) nfrags += 1; bridged = session->bsession != NULL; if (AVM_PKT_INFO(skb)->already_modified) { _pa_do_send_egress(session, skb, bridged); } else { session->ingress_sw_stats.tx_bytes += PKT_LEN(skb) + skb->data_len; session->ingress_sw_stats.tx_pkts += nfrags; pa_do_modify_non_l2(&session->mod, skb, bridged); _pa_do_send_egress(session, skb, bridged); if (session->timeout == 0) pa_session_flush(session, "fast timeout"); } } static int pa_egress_size_check(struct avm_pa_session *session, struct sk_buff *skb, int hdr_off) { struct avm_pa_pkt_match *info = &session->ingress; /* Check if the (inner) header allows for fragmentation. If not, and * there is no tunnel on egress, then the packet must not exceed the MTU of * the egress. Slow path will proably drop it and generate an ICMP error. */ if (AVM_PA_PKTTYPE_IP_VERSION(info->pkttype) == 4) { struct iphdr *iph = (struct iphdr *) (skb->data - hdr_off + info->ip_offset); if ((iph->frag_off & constant_htons(IP_DF)) == 0) return 0; } if (session->mod.push_encap_len == 0) { /* no tunnel on output */ struct avm_pa_mod_rec *mod = &session->mod; unsigned len = skb->len + hdr_off - mod->pull_l2_len - mod->pull_encap_len; struct avm_pa_egress *egress; avm_pa_for_each_egress(egress, session) { if (len > egress->mtu) return -1; } } return 0; } /* ------------------------------------------------------------------------ */ /* -------- macaddr management -------------------------------------------- */ /* ------------------------------------------------------------------------ */ /* There are two forms of macaddrs, pvid and non-pvid. * * pvid macaddrs are the common ones. They are used when sessions when * VLAN changes between ingress and egress or if no vlan is involved at all. * * non-pvid macaddrs are used only if the vlan between ingress and egress * does not change. * * The purpose of non-pvid macaddrs is to avoid flushing sessions in * the ingress pid change logic when a known ethernet address is observed * in a different vlan. As long as the vlan is known by the means of * a non-pvid macaddr (which means: there is a related session with * the same vlan on egress) then the packet is accepted and maybe accelerated. * * Otherwise, when an ethernet address is seen with a unknown vlan, then * we assume that the host has been moved to a different PID, i.e. one * that is based on virtual vlan interface. Then we flush * all sessions that hold the corresponding pvid session. This part is * crucial for bridging sessions. * * macaddrs are allocated for each egress per session. But multiple egress * may share macaddrs and therefore macaddrs are reference counted. * pvid and non-pvid macaddrs do not share refcounts, i.e. both can exist * without the other. But there is one catch: non-pvid macaddrs are only * fully deleted when there is no corresponding pvid macaddr, so that * we don't forget about "proper vlans" as long as there are pvid macaddrs * present (otherwise we would flush too early when a vlan packet * is observed again). These zero-reference non-pvid macaddrs are deleted * when there is no pvid macaddrs left. In the meantime they can * be looked up by the pid change logic (and also new egress for new sessions). * * Internally, pvid and non-pvid share the same hash bucket, because * only the address is hashed. However, pvid macaddrs are head-inserted * while non-pvid macaddrs are tail-inserted. This allows for quick * decision whether pvid macaddrs exist at all for a given address. 
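 * The pvid/non-pvid distinction is encoded in the refcount itself (see the
 * macros below): bit 0x1000000 marks a non-pvid entry, the remaining bits
 * hold the reference count. Illustrative values:
 *
 *   refcount 0x0000003  ->  pvid macaddr,     3 references
 *   refcount 0x1000000  ->  non-pvid macaddr, 0 references ("floating")
 *   refcount 0x1000002  ->  non-pvid macaddr, 2 references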
*/ #define PA_MACADDR_NON_PVID_OFFSET 0x1000000 #define PA_MACADDR_IS_PVID(macaddr) (!(macaddr->refcount & PA_MACADDR_NON_PVID_OFFSET)) #define PA_MACADDR_REFCOUNT(macaddr) (macaddr->refcount & ~PA_MACADDR_NON_PVID_OFFSET) static struct vlan_ethhdr * pa_get_ethhdr(enum avm_pa_framing framing, struct sk_buff *skb) { if (framing == avm_pa_framing_ether) return (struct vlan_ethhdr *) skb->data; if (framing == avm_pa_framing_dev) return vlan_eth_hdr(skb); return 0; } static u16 pa_get_vlan_id(enum avm_pa_framing framing, struct sk_buff *skb) { struct vlan_ethhdr *ethh = pa_get_ethhdr(framing, skb); if (skb_vlan_tag_present(skb)) return skb_vlan_tag_get_id(skb); else if ( ethh->h_vlan_proto == htons(ETH_P_8021Q) || ethh->h_vlan_proto == htons(ETH_P_8021AD)) return ntohs(ethh->h_vlan_TCI) & VLAN_VID_MASK; /* The null VID is equivlant to no VID, the header contains only priority information. * Therefore we can indicate the null VID if there is no vlan header. * See IEEE 802.1q */ return 0; } static u16 pa_get_vlan_match(struct avm_pa_pkt_match *match) { struct avm_pa_match_info *info = pa_find_eth_match(match); struct vlanhdr *vlanh; if (!info) return 0; /* vlan follows ethernet */ info += 1; if (info->type != AVM_PA_VLAN) return 0; if (info->offset == AVM_PA_OFFSET_NOT_SET) return match->vlan_tci; vlanh = (struct vlanhdr *) (HDRCOPY(match) + info->offset); return ntohs(vlanh->vlan_tci); } static size_t pa_macaddr2str(struct avm_pa_macaddr *macaddr, char *buf, size_t sz) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, macaddr->pid_handle); char vlan_buf[16] = ""; if (macaddr->vlan_id) sprintf(vlan_buf, " vlan %u", macaddr->vlan_id); return snprintf(buf, sz, "%pM%s%s ref %3lu pid %2d (%s)", macaddr->mac, vlan_buf, PA_MACADDR_IS_PVID(macaddr) ? " pvid" : "", PA_MACADDR_REFCOUNT(macaddr), pid->pid_handle, pid->cfg.name); } static void pa_show_macaddr(struct avm_pa_macaddr *macaddr, pa_fprintf fprintffunc, void *arg) { char buf[128]; pa_macaddr2str(macaddr, buf, sizeof(buf)); (*fprintffunc)(arg, "Macaddr : %s\n", buf); } static inline u32 macaddr_hash(const unsigned char mac[ETH_ALEN]) { u32 h = 0; int i; for (i=0; i < ETH_ALEN; i++) { h += mac[i]; h += (h<<10); h ^= (h>>6); } h += (h<<3); h ^= (h>>11); h += (h<<15); return h; } static struct avm_pa_macaddr * pa_macaddr_link(unsigned char mac[ETH_ALEN], avm_pid_handle pid_handle, bool is_pvid, u16 vlan_id) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_macaddr *p; u32 hash; int i; /* No macaddr for broadcast or multicast as we don't expect those on * ingress and therefore don't need them in the pid change logic. */ if (mac[0] & 1) return NULL; /* The hash covers only the ethernet addresses so that * avm_pa_macaddrs that differ only in vlan share the same bucket. */ hash = macaddr_hash(mac) % AVM_PA_MAX_MACADDR; spin_lock(&avm_pa_lock); /* First, try to locate existing entries. For pvid entries, the actual * vlan id doesn't matter. For non-pvid entries, the vlan id must match */ hlist_for_each_entry_rcu(p, &ctx->macaddr_hashtab[hash], macaddr_list) { if (ether_addr_equal(p->mac, mac)) { if (is_pvid && PA_MACADDR_IS_PVID(p)) goto out; else if (!is_pvid && !PA_MACADDR_IS_PVID(p) && vlan_id == p->vlan_id) goto out; } } for (i=0; i < ARRAY_SIZE(ctx->macaddr_array); i++) { p = &ctx->macaddr_array[i]; if (p->refcount == 0) { memcpy(p->mac, mac, ETH_ALEN); /* pvid macaddrs are always added to the head so that they come before * non-pvid macaddrs for the same address. Allows to cancel lookups * for pvid macaddrs early. 
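 * Resulting bucket layout (illustrative addresses):
 *
 *   head -> pvid(aa:..) -> pvid(cc:..) -> non-pvid(aa:.., vlan 7) -> tail
 *
 * so pa_macaddr_find_pvid() may stop at the first entry matching the
 * address: if that entry is not pvid, no pvid entry exists for it.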
*/ if (is_pvid) { hlist_add_head_rcu(&p->macaddr_list, &ctx->macaddr_hashtab[hash]); } else { p->refcount = PA_MACADDR_NON_PVID_OFFSET; hlist_add_tail_rcu(&p->macaddr_list, &ctx->macaddr_hashtab[hash]); } if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: new macaddr:\n"); pa_show_macaddr(p, pa_printk, KERN_DEBUG); } goto out; } } out: p->pid_handle = pid_handle; p->vlan_id = vlan_id; p->refcount++; spin_unlock(&avm_pa_lock); return p; } static struct avm_pa_macaddr * pa_macaddr_find_pvid(const char mac[ETH_ALEN]) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_macaddr *p; u32 hash; hash = macaddr_hash(mac) % AVM_PA_MAX_MACADDR; hlist_for_each_entry_rcu(p, &ctx->macaddr_hashtab[hash], macaddr_list) { if (ether_addr_equal(p->mac, mac)) { if (PA_MACADDR_IS_PVID(p)) return p; /* There is no pvid macaddr if this isn't one as pvid macaddrs are * always inserted at head. */ break; } } return NULL; } static void pa_macaddr_unlink(struct avm_pa_macaddr *destmac) { struct avm_pa_global *ctx = &pa_glob; spin_lock(&avm_pa_lock); destmac->refcount--; if (PA_MACADDR_REFCOUNT(destmac) > 0) goto unlock; if (!PA_MACADDR_IS_PVID(destmac)) { /* This is a non-pvid macaddr that isn't referenced by sessions anymore. * * As long as there is a pvid macaddr we keep this non-pvid macaddr * in a floating state where it can be used to a) prevent * extraneous "pid change" events and b) can be looked * up by pa_macaddr_link() in case of new sessions. */ if (pa_macaddr_find_pvid(destmac->mac) != NULL) goto unlock; /* Clear non-refcount bits and allow pa_macaddr_link() re-use this macaddr. * No memset()! destmac might be currently used in an RCU read side. */ destmac->refcount = 0; } else { /* This is a pvid macaddr that isn't referenced by sessions anymore. * * The above code puts non-pvid macaddrs in a floating state in the * presence of pvid macaddrs. So when we unlink the pvid macaddr we * must garbage-collect those floating non-pvid macaddrs now. * * The refcount must be checked! Floating macaddrs could be * referenced by new sessions in the meantime (i.e. not floating anymore). */ struct avm_pa_macaddr *p = destmac; hlist_for_each_entry_continue_rcu(p, macaddr_list) { if (ether_addr_equal(p->mac, destmac->mac)) { if (PA_MACADDR_IS_PVID(p)) { /* cannot happen and indicates a problem in the code */ pr_warn_ratelimited("duplicated pvid macaddr\n"); continue; } if (PA_MACADDR_REFCOUNT(p) == 0) { p->refcount = 0; hlist_del_rcu(&p->macaddr_list); /* Because of the _rcu semantics of the traversal we can delete * and still continue traversal because next pointer remains intact. */ } } } } hlist_del_rcu(&destmac->macaddr_list); unlock: if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: delete macaddr:\n"); pa_show_macaddr(destmac, pa_printk, KERN_DEBUG); } spin_unlock(&avm_pa_lock); } /* must be called inside rcu read side */ static void pa_check_and_handle_ingress_pid_change(unsigned char mac[ETH_ALEN], avm_pid_handle pid_handle, u16 vlan_id) { struct avm_pa_macaddr *p, *p_pvid; struct avm_pa_global *ctx = &pa_glob; u32 hash; int pid_group = PA_PID(ctx, pid_handle)->ecfg.pid_group; bool pid_changed = false; hash = macaddr_hash(mac) % AVM_PA_MAX_MACADDR; /* Look first if the low-level pid has changed. The pid is the same * for related pvid and non-pvid macaddrs, so the first mismatch is * sufficient to to trigger pid change. * If the pid has not changed, then we check vlan to detect * changes between vlan interfaces that use the same low-level pid. 
* We can stop looking if we find a macaddrs with the same vlan whether * this is a pvid macaddr or non-pvid. If we don't find a matching vlan * (and also no pid mismatch), then the vlan has changed and we must * flush all sessions that belong to the pvid macaddrs. non-pvid * macaddrs are not considered for flushing in that case because we assume * that only the "primary vlan" has changed and all other vlans are intact. */ p_pvid = NULL; hlist_for_each_entry_rcu(p, &ctx->macaddr_hashtab[hash], macaddr_list) { if (ether_addr_equal(p->mac, mac)) { if (p->pid_handle != pid_handle) { struct avm_pa_pid *pid = PA_PID(ctx, p->pid_handle); if (pid->ingress_pid_handle != pid_handle) { if (pid_group == 0 || pid_group != pid->ecfg.pid_group) { pid_changed = true; break; } } } else if (!PA_PID(ctx, pid_handle)->bridging_ok) { /* If the pid doesn't allow for bridge sessions then vlan checks can be * skipped. All sessions that egress to a !bridging_ok pid are fully classified * and there is no uncertainty about which vlan tag to add. */ return; } else if (p->vlan_id == vlan_id) { /* If vlan_id matches the vlan of the macaddr then it's alright. */ return; } else if (PA_MACADDR_IS_PVID(p)) { p_pvid = p; } } } { int old = ctx->stats.sess_flushed; if (pid_changed) { net_info_ratelimited("avm_pa: pid change (pid) for %pM (%s(%d) -> %s(%d))\n", p->mac, PA_PID(ctx, p->pid_handle)->cfg.name, p->pid_handle, PA_PID(ctx, pid_handle)->cfg.name, pid_handle); avm_pa_flush_sessions_for_mac(p->mac); } else if (p_pvid) { char vlan1[16] = "none"; char vlan2[16] = "none"; if (p_pvid->vlan_id) snprintf(vlan1, sizeof(vlan1), "%d", p_pvid->vlan_id); if (vlan_id) snprintf(vlan2, sizeof(vlan2), "%d", vlan_id); net_info_ratelimited("avm_pa: pid change (pvid) for %pM (%s -> %s)\n", p_pvid->mac, vlan1, vlan2); avm_pa_flush_sessions_with_destmac(p_pvid); } ctx->stats.sess_pidchanged += ctx->stats.sess_flushed - old; } } /* ------------------------------------------------------------------------ */ /* -------- pid life cycle management ------------------------------------- */ /* ------------------------------------------------------------------------ */ static void _pa_hw_pa_release(struct kref *ref) { struct avm_pa_global *ctx = &pa_glob; ctx->hardware_pa.flags = 0; if (ctx->hw_pa_flush_completion) { complete(ctx->hw_pa_flush_completion); ctx->hw_pa_flush_completion = NULL; } } static int pa_hw_pa_get(void) { struct avm_pa_global *ctx = &pa_glob; return kref_get_unless_zero(&ctx->hw_pa_ref); } static int pa_hw_pa_put(void) { struct avm_pa_global *ctx = &pa_glob; return kref_put(&ctx->hw_pa_ref, _pa_hw_pa_release); } static int pa_hw_pa_valid(struct avm_hardware_pa *hwpa) { if (!hwpa->remove_session) return 0; /* exactly one of add_session or add_session_skb must be set */ if (hwpa->add_session && !hwpa->add_session_skb) return 1; if (!hwpa->add_session && hwpa->add_session_skb) return 1; /* probe_session will become mandatory as well after some transition period */ return 0; } static void inline pa_pid_init(avm_pid_handle pid_handle, struct avm_pa_pid_cfg *cfg) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); BUG_ON(pid_handle == 0); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (before)\n", pid_handle, kref_read(&pid->ref), "pa_pid_init", (void *)_RET_IP_); #endif spin_lock_bh(&avm_pa_lock); /* Do not call pa_pid_get() on purpose. That would check "pid->pid_handle == 0" * in addition to the actual refcount, and return no new reference in that case. 
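 * A return value of 0 from kref_get_unless_zero() below therefore means the
 * slot is unused (refcount 0) and gets re-initialized; a non-zero return
 * means a live registration for this handle, which is treated as a bug.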
* But we want to detect if we're being called while no new reference are allowed */ if (kref_get_unless_zero(&pid->ref) == 0) { memset(pid, 0, sizeof(struct avm_pa_pid)); kref_init(&pid->ref); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (after)\n", pid_handle, kref_read(&pid->ref), "pa_pid_init(new)", (void *)_RET_IP_); #endif } else { pr_err("avm_pa: pid %d (%s) ref %d already registered\n", pid_handle, cfg->name, kref_read(&pid->ref)); spin_unlock_bh(&avm_pa_lock); BUG(); } pid->pid_handle = pid_handle; pid->cfg = *cfg; if (pid->cfg.default_mtu == 0) pid->cfg.default_mtu = 1500; pid->bridging_ok = 1; pid->ingress_framing = cfg->framing; switch (cfg->framing) { case avm_pa_framing_llcsnap: case avm_pa_framing_ether: case avm_pa_framing_ppp: case avm_pa_framing_ip: case avm_pa_framing_ipdev: pid->egress_framing = cfg->framing; pid->cfg.ptype = 0; break; case avm_pa_framing_dev: pid->egress_framing = avm_pa_framing_ether; pid->cfg.ptype = 0; break; case avm_pa_framing_ptype: pid->egress_framing = cfg->framing; pid->cfg.tx_func = 0; pid->cfg.tx_arg = 0; avm_pa_pid_activate_hw_accelaration(pid_handle); break; } spin_unlock_bh(&avm_pa_lock); } static void _pa_pid_delete(struct kref *ref) { struct avm_pa_pid *pid = container_of(ref, struct avm_pa_pid, ref); struct avm_pa_pid_hwinfo *hw = pid->hw; struct completion *done = pid->release_completion; /* Only cleared by avm_pa_dev_unregister(). It is a bug if the * ref drops to 0 without going through that function. */ BUG_ON(pid->pid_handle != 0); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf\n", pid->pid_handle, kref_read(&pid->ref), "_pa_pid_delete", (void *)_RET_IP_); #endif pid->ingress_pid_handle = 0; pid->hw = NULL; pid->release_completion = NULL; kfree(hw); if (done) complete(done); /* keep cfg for reuse by name */ } /* * Given a pid_handle, decrease the ref count of the corresponding avm_pa_pid. * Resources are released if the ref count drops to zero. * * Returns 1 if the pid_handle was removed, otherwise 0. */ static int pa_pid_put(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); int ret; BUG_ON(pid_handle == 0); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (before)\n", pid_handle, kref_read(&pid->ref), "pa_pid_put", (void *)_RET_IP_); #endif ret = kref_put(&pid->ref, _pa_pid_delete); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (after)\n", pid_handle, kref_read(&pid->ref), "pa_pid_put", (void *)_RET_IP_); #endif return ret; } /* * Given a pid_handle, increase the ref count of the corresponding avm_pa_pid * * Each session holds a ref on all pids involved. So if you have a valid session, * (as per pa_session_valid()) use PA_PID() instead, especially in the fast path, as * refcounting is unecessarily expensive. * * If the pid is not registered, 0 is returned and the ref count is restored. 
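 * Minimal usage sketch (hypothetical caller), balancing the reference with
 * pa_pid_put() once the pid is no longer needed:
 *
 *   avm_pid_handle h = pa_pid_get(pid_handle);
 *   if (h) {
 *      struct avm_pa_pid *pid = PA_PID(&pa_glob, h);
 *      ... use pid ...
 *      pa_pid_put(h);
 *   }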
*/ static avm_pid_handle pa_pid_get(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); BUG_ON(pid_handle == 0); #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (before)\n", pid_handle, kref_read(&pid->ref), "pa_pid_get", (void *)_RET_IP_); #endif if (kref_get_unless_zero(&pid->ref) == 0) return 0; #if AVM_PA_REF_DEBUG pr_err("avm_pa: pid %d ref %d (%s) called from %pf (after)\n", pid_handle, kref_read(&pid->ref), "pa_pid_get", (void *)_RET_IP_); #endif if (pid->pid_handle != pid_handle) { /* avm_pa_dev_unregister() clears pid->pid_handle to prevent new references */ kref_put(&pid->ref, _pa_pid_delete); return 0; } return pid->pid_handle; } /* * Given a pid_handle, increase the ref count of the corresponding avm_pa_pid and return it. * * If the pid is not registered, NULL is returned and the ref count is restored. */ static struct avm_pa_pid * pa_pid_get_pid(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; avm_pid_handle n; if (!pid_handle) return NULL; n = pa_pid_get(pid_handle); return n ? PA_PID(ctx, n) : NULL; } /* Uninlined versions for other modules, hot code paths should use pa_pid_get(). */ struct avm_pa_pid * avm_pa_pid_get_pid(avm_pid_handle pid_handle) { return pa_pid_get_pid(pid_handle); } int avm_pa_pid_put(avm_pid_handle pid_handle) { return pa_pid_put(pid_handle); } /* same for vpid, but don't tell there is no reference counting yet */ struct avm_pa_vpid * avm_pa_vpid_get_vpid(avm_vpid_handle vpid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_vpid *vpid = PA_VPID(ctx, vpid_handle); if (vpid->vpid_handle) return vpid; return NULL; } int avm_pa_vpid_put(avm_vpid_handle vpid_handle) { return 1; } /* ------------------------------------------------------------------------ */ /* -------- bsession management ------------------------------------------- */ /* ------------------------------------------------------------------------ */ static inline __be16 pa_vlanh_l3proto(struct vlan_ethhdr *ethh) { if ( ethh->h_vlan_proto == __constant_htons(ETH_P_8021Q) || ethh->h_vlan_proto == __constant_htons(ETH_P_8021AD)) return ethh->h_vlan_encapsulated_proto; else return ethh->h_vlan_proto; } static inline u16 pa_vlanh_vid(struct vlan_ethhdr *ethh) { if ( ethh->h_vlan_proto == __constant_htons(ETH_P_8021Q) || ethh->h_vlan_proto == __constant_htons(ETH_P_8021AD)) return ntohs(ethh->h_vlan_TCI) & VLAN_VID_MASK; else return 0; } static inline u32 pa_bkey(struct vlan_ethhdr *ethh, u16 vlan_tci) { u16 vid; if (vlan_tci) vid = vlan_tci & VLAN_VID_MASK; else vid = pa_vlanh_vid(ethh); return pa_vlanh_l3proto(ethh) | vid << 16; } static inline u32 pa_bhash(struct vlan_ethhdr *ethh, u16 vlan_tci) { return jhash_3words(get_unaligned((u32 *)(ðh->h_source[2])), get_unaligned((u32 *)(ðh->h_dest[2])), pa_bkey(ethh, vlan_tci), 0); } static inline struct avm_pa_session * pa_bsession_hash_search(struct avm_pa_pid *pid, u32 hash, struct vlan_ethhdr *ethh, u32 key) { struct avm_pa_data *pd = &pa_data; struct avm_pa_bsession *p = NULL; u32 h = hash%AVM_PA_MAX_HASH; rcu_read_lock(); /* The primary identifier for bsessions is the MAC address pair. The ingress vlan id * and l3 protocol are additonal keys that must match. MAC addresses and l3 protocol * are fixed for a given bsession and cannot change. Vlan id may change between * ingress and egress iff the packet data is not modified, i.e. vlan is * signalled out-of-bad via skb->vlan_tci. 
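 * Key layout as built by pa_bkey(): the l3 protocol occupies the low 16 bits,
 * the vlan id (taken from skb->vlan_tci if present, otherwise from the frame)
 * the bits above. pa_bhash() then mixes the last four bytes of source and
 * destination MAC with that key via jhash_3words().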
*/ hlist_for_each_entry_rcu(p, &pid->hash_bsess[h], hash_list) { /* Don't consider flushed sessions */ if (!memcmp(ethh, p->hdr, ETH_ALEN*2) && key == p->key) { if (!test_bit(PA_S_FLUSHED, &PA_SESSION(pd, p->session_handle)->flags)) break; } } rcu_read_unlock(); return p ? PA_SESSION(pd, p->session_handle) : NULL; } static inline struct avm_pa_session * pa_bsession_search(struct avm_pa_pid *pid, struct vlan_ethhdr *ethh, u16 vlan_tci) { return pa_bsession_hash_search(pid, pa_bhash(ethh, vlan_tci), ethh, pa_bkey(ethh, vlan_tci)); } static void pa_change_to_bridge_session(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_bsession *p = &ctx->bsess_array[session->session_handle]; struct avm_pa_pkt_match *match = &session->ingress; struct avm_pa_match_info *info = pa_find_eth_match(match); BUG_ON(!info); INIT_HLIST_NODE(&p->hash_list); p->hdr = (struct vlan_ethhdr *) (HDRCOPY(match) + info->offset); p->hash = pa_bhash(p->hdr, match->vlan_tci); p->key = pa_bkey(p->hdr, match->vlan_tci); p->session_handle = session->session_handle; ctx->stats.nbsessions++; pa_change_to_bridge_match(match); pa_change_to_bridge_match(&avm_pa_first_egress(session)->match); session->timeout = ctx->bridge_timeout_secs*HZ; session->bsession = p; } /* ------------------------------------------------------------------------ */ /* -------- session management -------------------------------------------- */ /* ------------------------------------------------------------------------ */ /* Search for ACTIVE sessions */ #define pa_session_search(pid, match) pa_session_hash_search(pid, match) static struct avm_pa_session * pa_session_hash_search(struct avm_pa_pid *pid, struct avm_pa_pkt_match *ingress) { struct avm_pa_session *p; u32 h = ingress->hash%AVM_PA_MAX_HASH; rcu_read_lock(); hlist_for_each_entry_rcu(p, &pid->hash_sess[h], hash_list) { /* Don't consider flushed sessions */ if (pa_match_eq(ingress, &p->ingress) && !test_bit(PA_S_FLUSHED, &p->flags)) break; } rcu_read_unlock(); return p; } static void pa_session_hash_insert(struct avm_pa_pid *pid, struct avm_pa_session *session) { struct avm_pa_bsession *bsession = session->bsession; u32 h = session->ingress.hash%AVM_PA_MAX_HASH; hlist_add_head_rcu(&session->hash_list, &pid->hash_sess[h]); if (bsession) { h = bsession->hash%AVM_PA_MAX_HASH; hlist_add_head_rcu(&bsession->hash_list, &pid->hash_bsess[h]); } } static void pa_session_hash_delete(struct avm_pa_pid *pid, struct avm_pa_session *session) { struct avm_pa_bsession *bsession = session->bsession; hlist_del_init_rcu(&session->hash_list); if (bsession) hlist_del_init_rcu(&bsession->hash_list); } static void pa_session_list_delete(struct avm_pa_session *session) { if (session->on_list < AVM_PA_LIST_MAX) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session_list *list = &ctx->sess_list[session->on_list]; BUG_ON(list->nsessions == 0 || list_empty(&list->sessions)); session->on_list = AVM_PA_LIST_MAX; list_del_rcu(&session->session_list); list->nsessions--; } } static void pa_session_list_update(struct avm_pa_session *session, int which) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session_list *list = &ctx->sess_list[which]; pa_session_list_delete(session); list->nsessions++; if (list->nsessions > list->maxsessions) list->maxsessions = list->nsessions; list_add_rcu(&session->session_list, &list->sessions); session->on_list = which; /* Ensure the GC timer runs if sessions are on any list (except FREE). 
* mod_timer() only if necessary, to maintain the ~0.5s interval even if * sessions are constantly added or removed */ if (which != AVM_PA_LIST_FREE && !timer_pending(&ctx->tick_timer)) mod_timer(&ctx->tick_timer, jiffies + AVM_PA_TICK_RATE); } static void pa_session_update(struct avm_pa_session *session) { /* Update endtime regardless of the session state, the endtime is only relevant * in state ACTIVE (a previous BUG_ON() was regularly triggered, see JZ-43644). */ session->endtime = jiffies + session->timeout; } static int pa_session_activate(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *ipid, *epid; struct avm_pa_session *s = NULL; struct avm_pa_bsession *bs = NULL; ipid = PA_PID(ctx, session->ingress_pid_handle); epid = PA_PID(ctx, session->static_egress.pid_handle); spin_lock(&avm_pa_lock); /* Move to ACTIVE only if no "same session" exists and PIDs are ready to use. * * Session creation can happen concurrently, but after this call only one * session of a kind may exist (to avoid confusing hardware acceleration), so the * hash lookup finds if anyone else won the race. * PID deregistration can also happen concurrently. Therefore we need * check if pid->pid_handle is still valid (inside the lock). We don't * need a full reference because they are hold by the session. */ if ((bs = session->bsession)) s = pa_bsession_hash_search(ipid, bs->hash, bs->hdr, bs->key); else s = pa_session_hash_search(ipid, &session->ingress); if (ipid->pid_handle && epid->pid_handle && s == 0) { pa_session_hash_insert(ipid, session); pa_session_list_update(session, AVM_PA_LIST_ACTIVE); pa_session_update(session); #if (defined(CONFIG_AVM_GENERIC_CONNTRACK) || defined(CONFIG_AVM_PA_GENERIC_CT)) /* session->generic_ct is shared between sessions and access must be locked. * See comment at pa_session_kill_nolock(). */ if (session->generic_ct) { u32 session_handle = (u32)session->session_handle; generic_ct_sessionid_set(session->generic_ct, session->generic_ct_dir, (void *)session_handle); } #endif /* The sessions is now permanent, so are the sessions references to the pids. */ } else { /* Session wasn't on state ACTIVE yet, so it's safe to kill without flush. * This will release the session's references as well */ pa_session_kill_nolock(session, s ? "lost creation race" : "pid gone"); } spin_unlock(&avm_pa_lock); return s ? 
AVM_PA_TX_SESSION_EXISTS : AVM_PA_TX_SESSION_ADDED; } static void __init avm_pa_init_freelist(void) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; struct avm_pa_session_list *free_list = &ctx->sess_list[AVM_PA_LIST_FREE]; int i; for (i = CONFIG_AVM_PA_MAX_SESSION - 1; i > 0; i--) { struct avm_pa_session *session = PA_SESSION(pd, i); list_add(&session->session_list, &free_list->sessions); session->on_list = AVM_PA_LIST_FREE; } free_list->maxsessions = free_list->nsessions = CONFIG_AVM_PA_MAX_SESSION - 1; for (i = ARRAY_SIZE(pd->egress_pool) - 1; i >= 0; i--) { struct avm_pa_egress *egress = &pd->egress_pool[i]; hlist_add_head(&egress->egress_list, &ctx->egress_freelist); } } static struct avm_pa_session *pa_session_alloc(struct avm_pa_pkt_match *match) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; struct avm_pa_session *session; struct avm_pa_session_list *free_list = &ctx->sess_list[AVM_PA_LIST_FREE]; session = NULL; spin_lock(&avm_pa_lock); if (!list_empty(&free_list->sessions)) { session = list_first_entry(&free_list->sessions, struct avm_pa_session, session_list); pa_session_list_delete(session); memset(session, 0, sizeof(struct avm_pa_session)); INIT_HLIST_NODE(&session->hash_list); INIT_LIST_HEAD(&session->session_list); INIT_HLIST_HEAD(&session->egress_head); INIT_HLIST_HEAD(&session->groups); hlist_add_head_rcu(&session->static_egress.egress_list, &session->egress_head); session->negress = 1; session->session_handle = session - pd->sessions; session->on_list = AVM_PA_LIST_MAX; session->uniq_id = atomic_inc_return(&ctx->session_uniq_id); session->ingress = *match; session->starttime = jiffies; session->endtime = jiffies; switch (AVM_PA_PKTTYPE_IPPROTO(match->pkttype)) { case IPPROTO_TCP: session->timeout = ctx->tcp_timeout_secs*HZ; break; case IPPROTO_UDP: case IPPROTO_ESP: case IPPROTO_L2TP: session->timeout = ctx->udp_timeout_secs*HZ; break; case IPPROTO_ICMPV6: case IPPROTO_ICMP: session->timeout = ctx->echo_timeout_secs*HZ; break; default: session->timeout = ctx->bridge_timeout_secs * HZ; break; } } spin_unlock(&avm_pa_lock); return session; } static struct avm_pa_egress * pa_egress_alloc(void) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_egress *egress; egress = NULL; spin_lock(&avm_pa_lock); if (!hlist_empty(&ctx->egress_freelist)) { egress = hlist_entry(hlist_first_rcu(&ctx->egress_freelist), struct avm_pa_egress, egress_list); hlist_del_rcu(&egress->egress_list); memset(egress, 0, sizeof(*egress)); INIT_HLIST_NODE(&egress->egress_list); } spin_unlock(&avm_pa_lock); return egress; } void pa_egress_free(struct avm_pa_egress *egress) { struct avm_pa_global *ctx = &pa_glob; spin_lock(&avm_pa_lock); if (!hlist_unhashed(&egress->egress_list)) hlist_del_rcu(&egress->egress_list); hlist_add_head_rcu(&egress->egress_list, &ctx->egress_freelist); spin_unlock(&avm_pa_lock); } static void pa_show_vlan_match(struct avm_pa_pkt_match *match, struct avm_pa_match_info *info, pa_fprintf fprintffunc, void *arg) { if (!info) { info = pa_find_eth_match(match); if (!info || (++info)->type != AVM_PA_VLAN) return; } /* At this time, AVM_PA_OFFSET_NOT_SET is only possible for type == AVM_PA_VLAN. * Do not use hdr in this case (it's NULL)! 
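 * An unset offset means the tag came in out-of-band (skb->vlan_tci), so the
 * value is printed from match->vlan_tci below instead of from the copied
 * header bytes.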
*/ if (info->offset != AVM_PA_OFFSET_NOT_SET) { hdrunion_t *hdr = (hdrunion_t *) (HDRCOPY(match) + info->offset); (*fprintffunc)(arg, "Vlan ID : %d\n", VLAN_ID(&hdr->vlanh)); } else { (*fprintffunc)(arg, "Vlan* ID : %d\n", match->vlan_tci&VLAN_VID_MASK); } } static void pa_show_pkt_bridge_match(struct avm_pa_pkt_match *match, pa_fprintf fprintffunc, void *arg) { char buf[128]; struct avm_pa_match_info *p; struct vlan_ethhdr *ethh; pkttype2str(match->pkttype & AVM_PA_PKTTYPE_IP_MASK, buf, sizeof(buf)); (*fprintffunc)(arg, "%-15s: %s\n", "PktType", buf); if ((p = pa_find_eth_match(match)) == NULL) return; ethh = (struct vlan_ethhdr *) (HDRCOPY(match) + p->offset); (*fprintffunc)(arg, "%-15s: %pM %pM\n", "Eth Addr", ethh->h_dest, ethh->h_source); (*fprintffunc)(arg, "%-15s: %08x\n", "Key", pa_bkey(ethh, match->vlan_tci)); } static void pa_show_pkt_full_match(struct avm_pa_pkt_match *match, u16 egress_pkttype, pa_fprintf fprintffunc, void *arg) { char buf[128]; const char *prompt = "PktType"; unsigned n; int s; if (egress_pkttype && egress_pkttype != match->pkttype) { size_t half = sizeof(buf)/2; pkttype2str(match->pkttype, buf, half); pkttype2str(egress_pkttype, buf+half, half); (*fprintffunc)(arg, "%-15s: %s -> %s\n", prompt, buf, buf+half); } else { pkttype2str(match->pkttype, buf, sizeof(buf)); (*fprintffunc)(arg, "%-15s: %s\n", prompt, buf); } (*fprintffunc)(arg, "Protocol : %04X\n", ntohs(match->protocol)); (*fprintffunc)(arg, "%-15s: %d (stored %d)\n", "Header len", match->full_hdrlen, match->hdrlen); for (n=0; n < match->nmatch; n++) { struct avm_pa_match_info *p = match->match+n; hdrunion_t *hdr = (hdrunion_t *) (HDRCOPY(match) + p->offset); switch (p->type) { case AVM_PA_ETH: s = mac2str(&hdr->ethh.h_dest, buf, sizeof(buf)); buf[s++] = ' '; mac2str(&hdr->ethh.h_source, &buf[s], sizeof(buf) - s); (*fprintffunc)(arg, "Eth Hdr : %s proto %04X\n", buf, ntohs(hdr->ethh.h_proto)); break; case AVM_PA_VLAN: /* VLAN match can come from the payload or skb->vlan_tci */ pa_show_vlan_match(match, p, fprintffunc, arg); break; case AVM_PA_PPPOE: (*fprintffunc)(arg, "PPPoE Sid : %04X [hdroff %d]\n", ntohs(hdr->pppoeh.sid), match->pppoe_offset); break; case AVM_PA_PPP: (*fprintffunc)(arg, "PPP Proto : %02X\n", hdr->ppph[0]); break; case AVM_PA_IPV4: s = in_addr2str(&hdr->iph.saddr, buf, sizeof(buf)); buf[s++] = ' '; in_addr2str(&hdr->iph.daddr, &buf[s], sizeof(buf) - s); (*fprintffunc)(arg, "IPv4 Hdr : %s proto %d tos %02X [hdroff %d]\n", buf, hdr->iph.protocol, ipv4_get_dsfield(&hdr->iph), match->ip_offset); break; case AVM_PA_IPV6: s = in6_addr2str(&hdr->ipv6h.saddr, buf, sizeof(buf)); buf[s++] = ' '; in6_addr2str(&hdr->ipv6h.daddr, &buf[s], sizeof(buf) - s); (*fprintffunc)(arg, "IPv6 Hdr : %s proto %d tos %02X flow %05X [hdroff %d]\n", buf, hdr->ipv6h.nexthdr, ipv6_get_dsfield(&hdr->ipv6h), ntohl(hdr->ipv6_vpfl) & 0xfffff, match->ip_offset); break; case AVM_PA_PORTS: (*fprintffunc)(arg, "Ports : %d -> %d [hdroff %d]\n", ntohs(hdr->ports[0]), ntohs(hdr->ports[1]), match->l4_offset); break; case AVM_PA_ICMPV4: prompt = "ICMPv4"; switch (hdr->icmph.type) { case ICMP_ECHOREPLY: (*fprintffunc)(arg, "%-15s: echo reply id=%hu [hdroff %d]\n", prompt, hdr->icmph.un.echo.id, match->l4_offset); break; case ICMP_ECHO: (*fprintffunc)(arg, "%-15s: echo request id=%hu [hdroff %d]\n", prompt, hdr->icmph.un.echo.id, match->l4_offset); break; default: (*fprintffunc)(arg, "??????\n"); break; } break; case AVM_PA_ICMPV6: prompt = "ICMPv6"; switch (hdr->icmpv6h.icmp6_type) { case ICMPV6_ECHO_REQUEST: 
(*fprintffunc)(arg, "%-15s: echo request id=%hu [hdroff %d]\n", prompt, hdr->icmpv6h.icmp6_identifier, match->l4_offset); break; case ICMPV6_ECHO_REPLY: (*fprintffunc)(arg, "%-15s: echo reply id=%hu [hdroff %d]\n", prompt, hdr->icmpv6h.icmp6_identifier, match->l4_offset); break; default: (*fprintffunc)(arg, "??????\n"); break; } break; case AVM_PA_LLC_SNAP: (*fprintffunc)(arg, "LLC SNAP : %04X\n", ntohs(hdr->llcsnap.type)); break; case AVM_PA_LISP: (*fprintffunc)(arg, "LISP : data header [hdroff %d]\n", match->lisp_offset); break; case AVM_PA_L2TP: (*fprintffunc)(arg, "L2TP Sess : %lu\n", (unsigned long)ntohl(hdr->l2tp.session_id)); break; case AVM_PA_GRE: (*fprintffunc)(arg, "GRE Proto : %04X\n", ntohs(hdr->greh.protocol)); break; case AVM_PA_ESP: (*fprintffunc)(arg, "ESP SPI : 0x%08X [hdroff %d]\n", ntohl(hdr->esph.spi), match->l4_offset); break; } } } static void pa_show_pkt_match(struct avm_pa_pkt_match *match, bool bridged, u16 egress_pkttype, pa_fprintf fprintffunc, void *arg) { if (bridged) pa_show_pkt_bridge_match(match, fprintffunc, arg); else pa_show_pkt_full_match(match, egress_pkttype, fprintffunc, arg); } static void pa_show_pkt_info(struct avm_pa_pkt_info *info, pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; (*fprintffunc)(arg, "In Pid : %d (%s)\n", info->ingress_pid_handle, PA_PID(ctx, info->ingress_pid_handle)->cfg.name); if (info->ingress_vpid_handle) { (*fprintffunc)(arg, "In VPid : %d (%s)\n", info->ingress_vpid_handle, PA_VPID(ctx, info->ingress_vpid_handle)->cfg.name); } if (info->egress_vpid_handle) { (*fprintffunc)(arg, "Out VPid : %d (%s)\n", info->egress_vpid_handle, PA_VPID(ctx, info->egress_vpid_handle)->cfg.name); } if (info->routed) (*fprintffunc)(arg, "Routed : yes\n"); if (info->shaped) (*fprintffunc)(arg, "Shaped : yes\n"); pa_show_pkt_match(&info->match, 0, 0, fprintffunc, arg); } static void pa_show_bsession(struct avm_pa_bsession *bsession, pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session = PA_SESSION(&pa_data, bsession->session_handle); struct avm_pa_egress *egress; unsigned negress; (*fprintffunc)(arg, "Session : %d\n", bsession->session_handle); (*fprintffunc)(arg, "In Pid : %d (%s)\n", session->ingress_pid_handle, PA_PID(ctx, session->ingress_pid_handle)->cfg.name); (*fprintffunc)(arg, "Hash : %lu\n", (unsigned long)bsession->hash); pa_show_pkt_bridge_match(&session->ingress, fprintffunc, arg); /* In practice, negress is always 1, since multicast uses normal sessions */ negress = 0; avm_pa_for_each_egress(egress, session) { (*fprintffunc)(arg, "Egress : %d of %d\n", ++negress, session->negress); if (egress->pid_handle) { (*fprintffunc)(arg, "Out Pid : %d (%s)\n", egress->pid_handle, PA_PID(ctx, egress->pid_handle)->cfg.name); } if (egress->vpid_handle) { (*fprintffunc)(arg, "Out VPid : %d (%s)\n", egress->vpid_handle, PA_VPID(ctx, egress->vpid_handle)->cfg.name); } if (egress->destmac) pa_show_macaddr(egress->destmac, fprintffunc, arg); pa_show_vlan_match(&egress->match, NULL, fprintffunc, arg); } } static void pa_show_session(struct avm_pa_session *session, pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; char buf[max_t(size_t, KSYM_SYMBOL_LEN, 64ul)]; struct avm_pa_macaddr *destmac; struct net_device *dev; unsigned negress; struct avm_pa_egress *egress; u16 pkttype; void *handler; (*fprintffunc)(arg, "Session : %d (%d)\n", session->uniq_id, session->session_handle); { unsigned long ms, s, min; ms = jiffies_to_msecs(jiffies - 
session->starttime); s = ms/1000; min = s/60; if (min) (*fprintffunc)(arg, "Age : %lumin %lu.%.3lus\n", min, s%60, ms%1000); else (*fprintffunc)(arg, "Age : %lu.%.3lus\n", s, ms%1000); } { char *state; if (session->on_list < AVM_PA_LIST_MAX) { const char *why = session->why_killed ? session->why_killed : "???"; bool flushed = test_bit(PA_S_FLUSHED, &session->flags); switch (session->on_list) { case AVM_PA_LIST_ACTIVE: state = flushed ? "flushed" : "active"; break; case AVM_PA_LIST_DEAD: snprintf(buf, sizeof(buf), "dead (%s)", why); state = buf; break; case AVM_PA_LIST_FREE: state = "free"; break; default: state = "BAD STATE"; break; } } else { state = "create"; } (*fprintffunc)(arg, "State : %s\n", state); } (*fprintffunc)(arg, "In Pid : %d (%s)\n", session->ingress_pid_handle, PA_PID(ctx, session->ingress_pid_handle)->cfg.name); if (session->ingress_vpid_handle) { (*fprintffunc)(arg, "In VPid : %d (%s)\n", session->ingress_vpid_handle, PA_VPID(ctx, session->ingress_vpid_handle)->cfg.name); } if (pa_hw_pa_valid(&ctx->hardware_pa)) { bool in_hw = test_bit(PA_S_IN_HW, &session->flags); if ((in_hw || avm_pa_get_hw_session(session)) && ctx->hardware_pa.session_state) (*fprintffunc)(arg, "In HW : %s\n", (*ctx->hardware_pa.session_state)(session)); else (*fprintffunc)(arg, "In HW : %s\n", in_hw ? "yes" : "no"); } #if (defined(CONFIG_AVM_GENERIC_CONNTRACK) || defined(CONFIG_AVM_PA_GENERIC_CT)) if (session->generic_ct) { if (session->generic_ct_dir == GENERIC_CT_DIR_ORIGINAL) (*fprintffunc)(arg, "CT dir : original\n"); else (*fprintffunc)(arg, "CT dir : reply\n"); } else { (*fprintffunc)(arg, "CT : no entry\n"); } #if (defined(CONFIG_BLOG) && IS_ENABLED(CONFIG_NF_CONNTRACK)) if (session->ct) { (*fprintffunc)(arg, "nf_conntrack : %lu\n", (unsigned long) session->ct); } else { (*fprintffunc)(arg, "nf_conntrack : no entry\n"); } #endif #endif (*fprintffunc)(arg, "Realtime : %s\n", test_bit(PA_S_REALTIME, &session->flags) ? 
"yes" : "no"); #ifdef CONFIG_AVM_PA_RPS if (session->rps_cpu) (*fprintffunc)(arg, "RPS cpu : %d\n", session->rps_cpu - 1); #endif pa_show_pkt_match(&session->ingress, session->bsession != 0, session->mod.pkttype, fprintffunc, arg); pa_show_mod_rec(&session->mod, fprintffunc, arg); (*fprintffunc)(arg, "Hroom : %u\n", (unsigned) session->needed_headroom); (*fprintffunc)(arg, "Timeout : %hu\n", session->timeout/HZ); (*fprintffunc)(arg, "SW stats : %lu pkts, %llu bytes\n", (unsigned long)session->ingress_sw_stats.tx_pkts, (unsigned long long)session->ingress_sw_stats.tx_bytes); (*fprintffunc)(arg, "HW stats : %lu pkts, %llu bytes (validflags 0x%x)\n", (unsigned long)session->ingress_hw_stats.tx_pkts, (unsigned long long)session->ingress_hw_stats.tx_bytes, session->ingress_hw_stats.validflags); negress = 0; avm_pa_for_each_egress(egress, session) { (*fprintffunc)(arg, "Egress : %d of %d\n", ++negress, session->negress); (*fprintffunc)(arg, "Type : %s\n", egresstype2str(egress->type)); if (egress->pid_handle) { (*fprintffunc)(arg, "Out Pid : %d (%s)\n", egress->pid_handle, PA_PID(ctx, egress->pid_handle)->cfg.name); } else { (*fprintffunc)(arg, "Egress under construction\n"); continue; } if (egress->vpid_handle) { (*fprintffunc)(arg, "Out VPid : %d (%s)\n", egress->vpid_handle, PA_VPID(ctx, egress->vpid_handle)->cfg.name); } (*fprintffunc)(arg, "Mtu : %u\n", (unsigned)egress->mtu); if (egress->push_l2_len) { data2hex(HDRCOPY(&egress->match), egress->push_l2_len, buf, sizeof(buf)); (*fprintffunc)(arg, "L2 push : %s\n", buf); if (egress->pppoe_offset != AVM_PA_OFFSET_NOT_SET) { (*fprintffunc)(arg, "PPPoE off : %u\n", (unsigned)egress->pppoe_offset); (*fprintffunc)(arg, "PPPoE hlen : %u\n", (unsigned)egress->pppoe_hdrlen); } } if ((destmac = egress->destmac) != 0) pa_show_macaddr(destmac, fprintffunc, arg); pa_show_pkt_match(&egress->match, session->bsession != 0, session->mod.pkttype, fprintffunc, arg); switch (egress->type) { case avm_pa_egresstype_output: { struct avm_pa_pid *pid = PA_PID(ctx, egress->pid_handle); (*fprintffunc)(arg, "Prio : %hx:%hx\n", TC_H_MAJ(egress->output.priority)>>16, TC_H_MIN(egress->output.priority)); (*fprintffunc)(arg, "TX queue : %hu\n", egress->output.txq_id); (*fprintffunc)(arg, "TC index : %hu\n", egress->output.tc_index); if (avm_pa_pid_tack_enabled(pid)) { (*fprintffunc)(arg, "tack pkts : %u (accl acks %u)\n", pid->prioack_acks, pid->prioack_accl_acks); } } break; case avm_pa_egresstype_local: pkttype = session->ingress.pkttype; if (AVM_PA_PKTTYPE_IP_VERSION(pkttype) == 4) handler = inet_protos[AVM_PA_PKTTYPE_IPPROTO(pkttype)]->handler; else if (AVM_PA_PKTTYPE_IP_VERSION(pkttype) == 6) handler = inet6_protos[AVM_PA_PKTTYPE_IPPROTO(pkttype)]->handler; else handler = NULL; (*fprintffunc)(arg, "Proto handler : %pf\n", handler); if ((dev = egress->local.dev) != 0) { (*fprintffunc)(arg, "Input Dev : %s\n", dev->name); } else { (*fprintffunc)(arg, "Input Dev : \n"); } break; case avm_pa_egresstype_rtp: (*fprintffunc)(arg, "transmitfunc : %pF\n", egress->rtp.transmit); break; case avm_pa_egresstype_xfrm: if (IS_ENABLED(CONFIG_XFRM)) { struct dst_entry *dst = egress->xfrm.dst; struct xfrm_state *x = egress->xfrm.x; (*fprintffunc)(arg, "TC index : %hu\n", egress->xfrm.tc_index); (*fprintffunc)(arg, "XFRM dst : %pf\n", dst->input); (*fprintffunc)(arg, "XFRM output : %pf\n", x->type->output); if (x->props.family == AF_INET) { (*fprintffunc)(arg, "XFRM saddr : %pI4\n", &x->props.saddr.a4); (*fprintffunc)(arg, "XFRM daddr : %pI4\n", &x->id.daddr.a4); } else if 
(x->props.family == AF_INET6) { (*fprintffunc)(arg, "XFRM saddr : %pI6\n", &x->props.saddr.a6); (*fprintffunc)(arg, "XFRM daddr : %pI6\n", &x->id.daddr.a6); } else { (*fprintffunc)(arg, "XFRM saddr : ??? (family %d)\n", x->props.family); (*fprintffunc)(arg, "XFRM daddr : ??? (proto %d)\n", x->id.proto); } (*fprintffunc)(arg, "XFRM spi : 0x%08x\n", ntohl(x->id.spi)); } break; case avm_pa_egresstype_null: break; } (*fprintffunc)(arg, "SW stats : %lu pkts, %llu bytes\n", (unsigned long)egress->sw_stats.tx_pkts, (unsigned long long)egress->sw_stats.tx_bytes); (*fprintffunc)(arg, "HW stats : %lu pkts, %llu bytes\n", (unsigned long)egress->hw_stats.tx_pkts, (unsigned long long)egress->hw_stats.tx_bytes); (*fprintffunc)(arg, "Pkts : TX %lu (acks %lu)\n", (unsigned long)egress->tx_pkts, (unsigned long)egress->tcpack_pkts); } avm_pa_sg_show_session(session, fprintffunc, arg); } static void pa_session_delete_rcu(struct rcu_head *head) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session = container_of(head, struct avm_pa_session, kill_rcu); struct avm_pa_egress *egress; struct hlist_node *tmp; /* Being inside rcu callback, we don't need _rcu traversal. Instead, * we need _safe traversal since egress are removed inside the loop. */ hlist_for_each_entry_safe(egress, tmp, &session->egress_head, egress_list) { if (egress->destmac) { pa_macaddr_unlink(egress->destmac); egress->destmac = 0; } switch (egress->type) { case avm_pa_egresstype_output: if (egress->output.dst) { dst_release(egress->output.dst); egress->output.dst = 0; } break; case avm_pa_egresstype_local: if (egress->local.dst) { dst_release(egress->local.dst); egress->local.dst = 0; } break; case avm_pa_egresstype_rtp: if (egress->rtp.sk) { sock_put(egress->rtp.sk); egress->rtp.sk = 0; } break; case avm_pa_egresstype_xfrm: if (IS_ENABLED(CONFIG_XFRM)) { dst_release(egress->xfrm.dst); xfrm_state_put(egress->xfrm.x); dev_put(egress->xfrm.dev); } break; case avm_pa_egresstype_null: ctx->stats.drop_sess_del++; break; } pa_pid_put(egress->pid_handle); if (egress != &session->static_egress) pa_egress_free(egress); } pa_pid_put(session->ingress_pid_handle); spin_lock(&avm_pa_lock); pa_session_list_update(session, AVM_PA_LIST_FREE); spin_unlock(&avm_pa_lock); } static void pa_session_delete(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; const char *why = session->why_killed ? session->why_killed : "???"; if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: delete session: %s\n", why); pa_show_session(session, pa_printk, KERN_DEBUG); } BUG_ON(session->on_list != AVM_PA_LIST_DEAD); pa_session_list_delete(session); #if AVM_PA_TRACE if (ctx->dbgtrace) { struct avm_pa_pid *pid = PA_PID(ctx, session->ingress_pid_handle); pa_printk(KERN_DEBUG, "avm_pa: delete session %d (%s) %s\n", session->session_handle, pid->cfg.name, why); } #endif /* * pa_session_kill() has * - removed session from hash * - removed session from hardware pa * - removed session from generic connection tracking */ BUG_ON(!hlist_unhashed(&session->hash_list)); BUG_ON(session->bsession && !hlist_unhashed(&session->bsession->hash_list)); BUG_ON(test_bit(PA_S_IN_HW, &session->flags)); #if (defined(CONFIG_AVM_GENERIC_CONNTRACK) || defined(CONFIG_AVM_PA_GENERIC_CT)) BUG_ON(session->generic_ct); #endif /* There may be packets in-flight at this point. Defer work that prevents * transmission of such packets. 
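 * call_rcu_bh() below defers pa_session_delete_rcu() until every current
 * RCU(-bh) reader, i.e. the softirq receive/transmit paths that may still
 * hold a pointer to this session, has left its read-side critical section.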
*/ call_rcu_bh(&session->kill_rcu, pa_session_delete_rcu); } static void pa_session_kill_rcu(struct rcu_head *head) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session = container_of(head, struct avm_pa_session, kill_rcu); if (test_and_clear_bit(PA_S_IN_HW, &session->flags)) { (*ctx->hardware_pa.remove_session)(session); pa_hw_pa_put(); } #if (defined(CONFIG_AVM_GENERIC_CONNTRACK) || defined(CONFIG_AVM_PA_GENERIC_CT)) if (session->generic_ct) { u32 session_handle = (u32)(session->session_handle); struct generic_ct *ct = session->generic_ct; session->generic_ct = 0; generic_ct_sessionid_set(ct, session->generic_ct_dir, (void *)(1U << 31 | session_handle)); generic_ct_put(ct); } #if (defined(CONFIG_BLOG) && IS_ENABLED(CONFIG_NF_CONNTRACK)) if (session->ct) { struct nf_conn *nfct; nfct = session->ct; session->ct = NULL; nf_conntrack_put(&nfct->ct_general); } #endif #endif /* * all packets that were in-flight in pa_session_kill() * should be counted here. */ avm_pa_sg_session_unlink(session); spin_lock(&avm_pa_lock); pa_session_list_update(session, AVM_PA_LIST_DEAD); spin_unlock(&avm_pa_lock); } static void pa_session_kill_nolock(struct avm_pa_session *session, const char *why) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, session->ingress_pid_handle); #if AVM_PA_TRACE if (ctx->dbgtrace) { pa_printk(KERN_DEBUG, "avm_pa: kill session %d (%s) %s\n", session->session_handle, pid->cfg.name, why); } #endif if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: kill session: %s\n", why); if (session->bsession) pa_show_bsession(session->bsession, pa_printk, KERN_DEBUG); else pa_show_session(session, pa_printk, KERN_DEBUG); } pa_session_list_delete(session); pa_session_hash_delete(pid, session); session->why_killed = why; /* There may be packets in-flight at this point. Defer work that prevents * transmission of such packets. */ call_rcu_bh(&session->kill_rcu, pa_session_kill_rcu); } static void pa_session_kill(struct avm_pa_session *session, const char *why) { spin_lock(&avm_pa_lock); pa_session_kill_nolock(session, why); spin_unlock(&avm_pa_lock); } static void pa_session_flush(struct avm_pa_session *session, const char *why) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid; #if AVM_PA_TRACE if (ctx->dbgtrace) { pid = PA_PID(ctx, session->ingress_pid_handle); pa_printk(KERN_DEBUG, "avm_pa: flush session %d (%s) %s\n", session->session_handle, pid->cfg.name, why); } #endif if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: flush session: %s\n", why); pa_show_session(session, pa_printk, KERN_DEBUG); } set_bit(PA_S_FLUSHED, &session->flags); /* will be killed on next gc */ session->why_killed = why; } /* ------------------------------------------------------------------------ */ /* -------- wall clock ---------------------------------------------------- */ /* ------------------------------------------------------------------------ */ static void pa_session_prioack_check(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_egress *egress = avm_pa_first_egress(session); if (test_bit(PA_S_PRIOACK_CHECK, &session->flags) && egress->tx_pkts > ctx->prioack_thresh_packets) { /* * Stop using TGET priority. * We need to decide if we use TACK priority or restore original priority. 
* (TACK priority is same as original priority, if TACK is not enabled) * 2016-10-14 calle */ unsigned long percent_ack = (egress->tcpack_pkts * 100) / egress->tx_pkts; if (ctx->dbgprioack) { pa_printk(KERN_DEBUG, "avm_pa: session %d: %lu%% TCP-ACKs (%u pkts %u ACKs) \n", session->session_handle, percent_ack, egress->tx_pkts, egress->tcpack_pkts); } if (percent_ack > ctx->prioack_ratio) set_bit(PA_S_PRIOACK_ACK, &session->flags); if (ctx->dbgprioack) { pa_printk(KERN_DEBUG, "avm_pa: session %d: priority %x:%x -> %s\n", session->session_handle, TC_H_MAJ(egress->output.priority)>>16, TC_H_MIN(egress->output.priority), test_bit(PA_S_PRIOACK_ACK, &session->flags) ? "TACK" : "NORMAL"); } /* next packet will re-create a new session based on the new priority */ set_bit(PA_S_PRIOACK_DONE, &session->flags); /* atomic test_and_clear not necessary because we're only called from the tick */ clear_bit(PA_S_PRIOACK_CHECK, &session->flags); } } static void pa_session_stats_get_diff(u32 *pkts, u64 *bytes, struct avm_pa_session_stats *last, struct avm_pa_session_stats *now) { *pkts = now->tx_pkts - last->tx_pkts; last->tx_pkts = now->tx_pkts; *bytes = now->tx_bytes - last->tx_bytes; last->tx_bytes = now->tx_bytes; } static inline unsigned int pa_get_priority(unsigned int prio) { prio &= TC_H_MIN_MASK; if (prio >= AVM_PA_MAX_PRIOS) prio = AVM_PA_MAX_PRIOS-1; return prio; } static inline unsigned int pa_get_egress_priority(struct avm_pa_egress *egress) { return pa_get_priority(egress->output.priority); } static inline unsigned int pa_get_ingress_priority(struct avm_pa_session *session) { /* * Ensure that the returned ingress priority is always in the range * [0, AVM_PA_MAX_PRIOS-1], otherwise Klocwork will complain if * the ingress priority is used as index to the VPID ingress priority * statistics array. */ return pa_get_priority(session->ingress_priority); } static inline unsigned int pa_get_ingress_priority_from_pkt_mark(u32 pkt_mark) { /* * Consider only networks for now, which are encoded as the two * most significant bytes. 
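 * A value in those bytes that is >= AVM_PA_MAX_PRIOS is clamped to
 * AVM_PA_MAX_PRIOS-1 below, mirroring pa_get_priority().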
*/ unsigned int prio = AVM_PA_INGRESS_PRIO_NET(pkt_mark); if (prio >= AVM_PA_MAX_PRIOS) { prio = AVM_PA_MAX_PRIOS-1; } return prio; } /* ------------------------------------------------------------------------ */ static void pa_session_check_pa(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; if (test_bit(PA_S_IN_HW, &session->flags) && ctx->hardware_pa.check_session) { unsigned ret = ctx->hardware_pa.check_session(session); /* Warn on unknown return codes, indicates too old avm_pa tag */ WARN_ON_ONCE(ret & ~AVM_HW_CHK_FLUSH); if (ret == AVM_HW_CHK_FLUSH) pa_session_flush(session, "void by hw"); } } static int pa_session_handle_stats(struct avm_pa_session *session) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid; struct avm_pa_egress *egress; struct avm_pa_session_stats stats; u64 bytes, hw_bytes; u32 pkts, hw_pkts; unsigned validflags; stats.validflags = 0; pa_session_stats_get_diff(&pkts, &bytes, &session->ingress_last_sw_stats, &session->ingress_sw_stats); if ( !test_bit(PA_S_IN_HW, &session->flags) || ctx->hardware_pa.session_stats == 0 || (*ctx->hardware_pa.session_stats)(session, &stats) != 0) { validflags = 0; } else { validflags = stats.validflags; } if (validflags & AVM_PA_SESSION_STATS_VALID_PKTS) hw_pkts = stats.tx_pkts; else hw_pkts = 0; if (validflags & AVM_PA_SESSION_STATS_VALID_BYTES) hw_bytes = stats.tx_bytes; else hw_bytes = 0; if (ctx->dbgstats && validflags) pr_debug("session %d valid 0x%x, %lu/%lu pkts, %llu/%llu bytes\n", session->session_handle, validflags, (unsigned long)pkts, (unsigned long)hw_pkts, bytes, hw_bytes); session->ingress_hw_stats.tx_pkts += hw_pkts; session->ingress_hw_stats.tx_bytes += hw_bytes; session->ingress_hw_stats.validflags |= validflags; avm_pa_for_each_egress(egress, session) { egress->hw_stats.tx_pkts += hw_pkts; egress->hw_stats.tx_bytes += hw_bytes; if (egress->pid_handle) { pid = PA_PID(ctx, egress->pid_handle); pid->tx_pkts += pkts + hw_pkts; } } return validflags != 0; } static void pa_tick_sessions(void) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session; struct avm_pa_session_list *list = &ctx->sess_list[AVM_PA_LIST_ACTIVE]; ktime_t now = ktime_get_boottime(); /* Collect stats for all sessions, detecting possible timeouts in hardware. * This runs lockless. 
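 * "Lockless" means the active list is only traversed under rcu_read_lock()
 * here; writers serialize list updates on avm_pa_lock elsewhere.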
*/ rcu_read_lock(); list_for_each_entry_rcu(session, &list->sessions, session_list) { session->stats_timestamp = now; pa_session_check_pa(session); if (pa_session_handle_stats(session)) pa_session_update(session); pa_session_prioack_check(session); } rcu_read_unlock(); } static void pa_tick_session_gc_nolock(int force) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd __maybe_unused = &pa_data; struct avm_pa_session *session, *next; struct avm_pa_session_list *list; struct avm_pa_l2tp *l2tp __maybe_unused; int i __maybe_unused; if (force) { list = &ctx->sess_list[AVM_PA_LIST_ACTIVE]; list_for_each_entry_safe(session, next, &list->sessions, session_list) { pa_session_kill_nolock(session, "disable"); ctx->stats.sess_flushed++; } } list = &ctx->sess_list[AVM_PA_LIST_DEAD]; list_for_each_entry_safe(session, next, &list->sessions, session_list) { if (avm_pa_get_hw_session(session) == NULL) { pa_session_delete(session); } } list = &ctx->sess_list[AVM_PA_LIST_ACTIVE]; list_for_each_entry_safe(session, next, &list->sessions, session_list) { if (test_bit(PA_S_FLUSHED, &session->flags)) { pa_session_kill_nolock(session, session->why_killed); } else if (time_is_before_eq_jiffies(session->endtime)) { /* flush in case a packet is received right now on another CPU, * killing immediately is racy. */ pa_session_flush(session, session->timeout ? "timeout" : "fin"); ctx->stats.sess_timedout++; } } #ifdef CONFIG_L2TP for (i = 0; i < ARRAY_SIZE(pd->l2tp_cache); i++) { struct l2tp_session *local_sess; l2tp = &pd->l2tp_cache[i]; /* We only clear the cache entry for now. * TODO: Maybe clear out corresponding sessions to truly stop forwarding */ local_sess = pa_l2tp_session_get_local(l2tp->session_id); if (local_sess == NULL) l2tp->session_id = 0; else pa_l2tp_session_put_local(local_sess); } #endif } static unsigned long last_tick; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) static void pa_session_tick(unsigned long force) #else static void pa_session_tick(struct timer_list *timer) #endif { struct avm_pa_global *ctx = &pa_glob; unsigned long next_tick; /* Minimize timer temporal drift */ next_tick = jiffies + AVM_PA_TICK_RATE; last_tick = jiffies; pa_tick_sessions(); spin_lock(&avm_pa_lock); #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) pa_tick_session_gc_nolock(force); #else pa_tick_session_gc_nolock(0); #endif /* The tick_timer is only necessary as long as there are any sessions */ if ( ctx->sess_list[AVM_PA_LIST_ACTIVE].nsessions || ctx->sess_list[AVM_PA_LIST_DEAD].nsessions) mod_timer(&ctx->tick_timer, next_tick); spin_unlock(&avm_pa_lock); } static void pa_session_gc_once(void) { spin_lock_bh(&avm_pa_lock); pa_tick_session_gc_nolock(1); spin_unlock_bh(&avm_pa_lock); } /*------------------------------------------------------------------------ */ /* must be called inside rcu read side */ static inline void pa_tbf_forward(struct avm_pa_session *session, PKT *pkt) { /* Set the session_handle to be sure, although it's not always used during transmit. */ AVM_PKT_INFO(pkt)->session_handle = session->session_handle; AVM_PKT_INFO(pkt)->session_uniq_id = session->uniq_id; pa_do_modify_and_send(session, pkt); } /* ------------------------------------------------------------------------ */ #ifdef CONFIG_AVM_PA_RPS static bool pa_rps_reverse_array_grow(struct avm_pa_rps *rps, size_t new_size) { struct sk_buff **p; p = kmalloc_array(new_size, sizeof(struct sk_buff *), GFP_ATOMIC); if (!p) return false; /* Replace the reverse array with the larger one. 
The content is maintained * but towards the end of the new array as the array is filled from top to * bottom. */ memcpy(&p[rps->r_sz], &rps->r[0], rps->r_sz * sizeof(struct sk_buff *)); rps->r_sz = new_size; kfree(rps->r); rps->r = p; return true; } static void pa_rps_dequeue_task(unsigned long data) { struct avm_pa_rps *rps = (struct avm_pa_rps *) data; struct sk_buff *skb, *nskb; int i; rps->rx_dequeued++; /* atomically clear the skb queue to allow for concurrent enqueue * on the emptied queue (again, based on llist). * * Afterwards, the skb queue must be reversed because enqueue has "add_head" * semantics. We use an array instead of reversing the linked list because that * was found to be faster (260 vs 295 Mbps on a 7590 with wireguard traffic). */ skb = xchg(&rps->q, NULL); i = rps->r_sz - 1; more: while (skb && i >= 0) { nskb = skb->next; skb->next = NULL; rps->r[i--] = skb; skb = nskb; } /* Array too short, try to resize and continue filling the larger array. * Send and restart anew if allocation fails. */ if (unlikely(skb)) { size_t new_size = rps->r_sz * 2; if (pa_rps_reverse_array_grow(rps, new_size)) { /* invariant: i == -1. We continue the loop at the half of the grown array. */ i += new_size/2; goto more; } else { net_crit_ratelimited("%s realloc failed for %d bytes\n", __func__, new_size * sizeof(struct sk_buff *)); } } /* Now walk the reversed list that is back in the order of received skbs */ rcu_read_lock(); while (++i < rps->r_sz) { pa_do_modify_and_send(NULL, rps->r[i]); } rcu_read_unlock(); if (unlikely(skb)) { /* Handle allocation failure above simply by restarting with the smaller reverse * array. This leads to packet reordering and should be avoided if possible. */ i = rps->r_sz-1; goto more; } } static void pa_rps_ipi_task(unsigned long data) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_rps *rps = (struct avm_pa_rps *) data; int tcpu = rps - ctx->rps; rps->rx_rps_ipis++; /* This eventually calls pa_rps_dequeue_task() above through a * tasklet on another CPU. * * On older kernels, carefully avoid issuing an ipi if there is one in-flight * already. In this case the async call would block, risking a dead lock. * smp_call_function_single_async() sets csd.flags to CSD_FLAG_LOCK * internally to detect repeated calls itself, so we just re-use that * instead of maintaining our own guard. * * Beginning with 5.7 the kernel checks this on its own and indicates * in-flight ipi requests by returning -EBUSY. */ #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) if (rps->csd.flags) return; #endif smp_call_function_single_async(tcpu, &rps->csd); } static inline void pa_rps_forward(struct avm_pa_session *session, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_rps *rps; u32 tcpu; struct sk_buff *qhead; /* RPS is done only in certain conditions. It's not done when * 1) rps globally disabled * 2) the session is a bridged session: * - no true flow hash recorded in the session * - questionable effect since bsessions are so cheap * 3) realtime session * - no queueing allowed * 4) rps already done once * - in dual session data paths rps may be done in the first * session already */ if (!ctx->rps_enabled || session->bsession || AVM_PKT_INFO(skb)->rps_done || test_bit(PA_S_REALTIME, &session->flags)) { pa_tbf_forward(session, skb); return; } /* Set the session_handle to be sure, although it's not always used during transmit. 
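 *
 * The per-CPU handoff below follows the lock-free llist pattern: producers
 * push onto rps->q with add_head semantics, the consumer tasklet grabs the
 * whole batch at once. Reduced to the bare pattern (this is what the code
 * below and pa_rps_dequeue_task() do):
 *
 *   // producer, any context:
 *   do {
 *      skb->next = qhead = READ_ONCE(rps->q);
 *   } while (cmpxchg(&rps->q, qhead, skb) != qhead);
 *
 *   // consumer, tasklet on the target CPU:
 *   skb = xchg(&rps->q, NULL);   // takes the batch, newest packet first
 *
 * Because the batch comes out newest-first, the consumer reverses it (via
 * the r[] array) before transmitting, to preserve packet order.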
*/ AVM_PKT_INFO(skb)->session_handle = session->session_handle; AVM_PKT_INFO(skb)->session_uniq_id = session->uniq_id; /* Don't do RPS twice, e.g. if there are two sessions for a packet. */ AVM_PKT_INFO(skb)->rps_done = 1; /* Select CPU via session hash, giving good enough distribution (hopefully) */ if (!session->rps_cpu) tcpu = session->ingress.hash & (CONFIG_AVM_PA_RPS_QUEUES-1); else tcpu = session->rps_cpu - 1; /* Ensure new CPU is online and usable. */ tcpu = cpumask_next(tcpu-1, cpu_online_mask); if (unlikely(tcpu >= min_t(u32, CONFIG_AVM_PA_RPS_QUEUES, nr_cpu_ids))) tcpu = cpumask_first(cpu_online_mask); BUG_ON(tcpu >= CONFIG_AVM_PA_RPS_QUEUES); rps = &ctx->rps[tcpu]; rps->rx_enqueued++; /* Based on llist_add_batch, inline for performance. As this has * add_head semantics the consumer must reverse the list to avoid * packet reordering. */ do { skb->next = qhead = READ_ONCE(rps->q); } while (cmpxchg(&rps->q, qhead, skb) != qhead); /* IPIs are relatively expensive. Hold IPIs up until there is * a sufficient number of packets queued up. This comes automatically * by deferring via tasklet. */ tasklet_schedule(&rps->ipi_task); } #endif /* must be called inside rcu read side */ static inline void pa_forward(struct avm_pa_session *session, struct sk_buff *skb) { #ifdef CONFIG_AVM_PA_RPS pa_rps_forward(session, skb); #else pa_tbf_forward(session, skb); #endif } /* ------------------------------------------------------------------------ */ #define MAX_TASKLET_PACKETS 32 static void avm_pa_irq_tasklet(unsigned long data) { struct avm_pa_global *ctx = &pa_glob; int count = MAX_TASKLET_PACKETS; struct sk_buff *skb; rcu_read_lock(); while (count-- > 0 && (skb = skb_dequeue(&ctx->irqqueue)) != 0) { struct avm_pa_session *session; session = pa_session_get(AVM_PKT_INFO(skb)->session_handle); /* Shouldn't happen but better play safe. 
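 * A session can be killed between enqueue and this tasklet running; the
 * uniq_id comparison additionally guards against the session handle having
 * been reused for a new session in the meantime.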
*/ if (session && session->uniq_id == AVM_PKT_INFO(skb)->session_uniq_id) { pa_forward(session, skb); } else { ctx->stats.fw_drop_gone++; PKT_FREE(skb); } if (AVM_PKT_INFO(skb)->l2tp_session_id != 0) { /* Just populate the cache, don't inspect packet again */ pa_l2tp_session_alloc(AVM_PKT_INFO(skb)->l2tp_session_id); AVM_PKT_INFO(skb)->l2tp_session_id = 0; } } rcu_read_unlock(); if (skb_queue_len(&ctx->irqqueue)) tasklet_schedule(&ctx->irqtasklet); } /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ void avm_pa_rx_channel_suspend(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); pid->rx_channel_stopped = 1; } EXPORT_SYMBOL(avm_pa_rx_channel_suspend); void avm_pa_rx_channel_resume(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); pid->rx_channel_stopped = 0; } EXPORT_SYMBOL(avm_pa_rx_channel_resume); void avm_pa_rx_channel_packet_not_accelerated(avm_pid_handle pid_handle, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if (avm_pa_pid_receive(pid_handle, skb) == AVM_PA_RX_ACCELERATED) return; if (likely(pid && pid->ecfg.rx_slow)) { (*pid->ecfg.rx_slow)(pid->ecfg.rx_slow_arg, skb); return; } PKT_FREE(skb); ctx->stats.rx_channel_no_rx_slow++; } EXPORT_SYMBOL(avm_pa_rx_channel_packet_not_accelerated); void avm_pa_tx_channel_accelerated_packet(avm_pid_handle pid_handle, avm_session_handle session_handle, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session; rcu_read_lock_bh(); session = pa_session_get(session_handle); if (session && session->on_list == AVM_PA_LIST_ACTIVE) { if ( avm_pa_first_egress(session)->type == avm_pa_egresstype_xfrm && avm_pa_first_egress(session)->xfrm.x->km.state != XFRM_STATE_VALID) { pa_session_flush(session, "xfrm state invalid"); goto drop; } /* Set some important skb fields, as pa_do_modify_non_l2() would have done */ skb_vlan_tag_clear(skb); skb_reset_mac_header(skb); if (session->bsession == 0) { skb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); if (session->mod.outer_ipversion == 4) { pskb_trim(skb, ntohs(PA_IPTOTLEN(skb->data))); skb_set_transport_header(skb, session->mod.v4_mod.iphlen); } else if (session->mod.outer_ipversion == 6) { pskb_trim(skb, ntohs(PA_IP6_PAYLOADLEN(skb->data)) + sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); } } AVM_PKT_INFO(skb)->seen = 1; AVM_PKT_INFO(skb)->already_modified = 1; pa_forward(session, skb); } else { drop: ctx->stats.tx_channel_dropped++; PKT_FREE(skb); } rcu_read_unlock_bh(); } EXPORT_SYMBOL(avm_pa_tx_channel_accelerated_packet); /* Validate packet and hand it over to the transmit path. * * Return code: * - AVM_PA_RX_OK if the packet could be accelerated but session became * invalid/stale. It would make sense to evaluate the packet for a new session. * - AVM_PA_RX_BYPASS if the packet is cannot be accelerated no matter what. 
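 * - AVM_PA_RX_ACCELERATED if the packet was consumed by the fast path
 *   (forwarded, queued, or dropped on queue overflow); the caller must not
 *   touch it afterwards.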
*/ static int pa_try_accelerate(struct avm_pa_pid *pid, struct avm_pa_session *session, bool bsession, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pkt_info *info = AVM_PKT_INFO(skb); struct avm_pa_pkt_match *match = &session->ingress; int header_offset; int headlen; char *head; /* As long as we don't fully grab the packet we must not move skb->data. But * session information is based on the actual header and packet size validation * can only use session information. Therefore we must account for the offset. */ if (pid->ingress_framing == avm_pa_framing_dev) header_offset = skb->data - skb_mac_header(skb); else if (pid->ingress_framing == avm_pa_framing_ipdev) header_offset = skb->data - skb_network_header(skb); else header_offset = 0; if (bsession) goto bridged; head = skb->data - header_offset; headlen = skb_headlen(skb) + header_offset; if ( avm_pa_first_egress(session)->type == avm_pa_egresstype_xfrm && avm_pa_first_egress(session)->xfrm.x->km.state != XFRM_STATE_VALID) { pa_session_flush(session, "xfrm state invalid"); return AVM_PA_RX_OK; } if (test_and_clear_bit(PA_S_PRIOACK_DONE, &session->flags)) { /* This is the first packet after completed prioack_check. Force slow * path so that a new session (based on the new priority) can be * created. Furthermore, that session may use a different egress if * that can be offloaded to hw. Flag the packet such that prioack won't * be attempted again. */ if (test_bit(PA_S_PRIOACK_ACK, &session->flags)) AVM_PKT_INFO(skb)->prioack_result = PRIOACK_ACK; else AVM_PKT_INFO(skb)->prioack_result = PRIOACK_NORMAL; pa_session_flush(session, "prioack done"); return AVM_PA_RX_OK; } if (match->pkttype & AVM_PA_PKTTYPE_LISP) { void *slhdr = LISPDATAHDR(match); void *ilhdr = head + match->lisp_offset; if (memcmp(slhdr, ilhdr, LISP_DATAHDR_SIZE) != 0) { pa_session_flush(session, "lisp data header changed"); ctx->stats.rx_lispchanged++; return AVM_PA_RX_OK; } } /* Too small packets or too little headroom are systematically wrong. * They must be fixed in the at the PID side. Therefore the errors are always * printed. */ if (headlen < match->full_hdrlen) { ctx->stats.rx_too_small++; if (net_ratelimit()) pr_err("avm_pa: pid %u (%s): too small packet: %d (need %d)\n", pid->pid_handle, pid->cfg.name, headlen, match->full_hdrlen); return AVM_PA_RX_BYPASS; } if ((skb_headroom(skb) - header_offset) < session->needed_headroom) { ctx->stats.rx_headroom_too_small++; if (net_ratelimit()) pr_err("avm_pa: pid %u (%s): too little headroom: %d (need %d)\n", pid->pid_handle, pid->cfg.name, skb_headroom(skb) - header_offset, session->needed_headroom); return AVM_PA_RX_BYPASS; } if (pa_egress_size_check(session, skb, header_offset) < 0) { ctx->stats.rx_df++; #if AVM_PA_TRACE if (ctx->dbgtrace) pr_debug("avm_pa: %lu - avm_pa_pid_receive(%s) - %s\n", pkt_uniq_id(skb), pid->cfg.name, "size problem"); #endif return AVM_PA_RX_BYPASS; } if (AVM_PA_PKTTYPE_IPPROTO(match->pkttype) == IPPROTO_TCP) { /* fast check for tcp control flags */ struct tcphdr *tcph = (struct tcphdr *) (head + match->l4_offset); /* set tcp_nodata for pa_transmit() */ info->tcp_nodata = pa_match_is_tcp_nodata(match, head); if (PA_TCP_FIN_OR_RST(tcph) || session->timeout == 0) { /* Fin terminates sessions, all further packets (including acks for * fin) take the slow path. Only set the timeout to prevent * session creation by the very last ack. However, when the tcp * socket is reused (indicated by a new syn) quickly, we must create * a new session for it immediately. 
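 * In short: a SYN flushes the stale session so a fresh one can be created
 * right away; any other packet merely sets do_not_accelerate. The timeout
 * is zeroed in both cases.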
*/ if (PA_TCP_SYN(tcph)) pa_session_flush(session, "new flow"); else info->do_not_accelerate = 1; session->timeout = 0; pa_session_update(session); return AVM_PA_RX_BYPASS; } } bridged: pa_session_update(session); if (ctx->fw_disabled) { if (session->timeout == 0) pa_session_flush(session, "fast timeout"); #if AVM_PA_TRACE if (ctx->dbgtrace) pr_debug("avm_pa: %lu - avm_pa_pid_receive(%s) - %s\n", pkt_uniq_id(skb), pid->cfg.name, "forward disabled"); #endif return AVM_PA_RX_OK; } #if AVM_PA_TRACE if (ctx->dbgtrace) pr_debug("avm_pa: %lu - avm_pa_pid_receive(%s) - %s\n", pkt_uniq_id(skb), pid->cfg.name, "accelerated"); #endif if (skb_has_frag_list(skb)) { ctx->stats.rx_frag_list += 1; } /* The packet is finally ours and we are free to move skb->data */ __skb_push(skb, header_offset); if (in_irq() || irqs_disabled()) { if (skb_queue_len(&ctx->irqqueue) > AVM_PA_MAX_IRQ_QUEUE_LEN) { ctx->stats.rx_irqdropped++; PKT_FREE(skb); } else { info->session_handle = session->session_handle; info->session_uniq_id = session->uniq_id; skb_queue_tail(&ctx->irqqueue, skb); ctx->stats.rx_irq++; tasklet_schedule(&ctx->irqtasklet); } } else { pa_forward(session, skb); } return AVM_PA_RX_ACCELERATED; } int avm_pa_pid_session_receive(avm_pid_handle pid_handle, avm_session_handle session_handle, struct sk_buff *skb) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session; bool fw_ok; int ret = AVM_PA_RX_OK; rcu_read_lock_bh(); ctx->stats.rxfw_pkts++; session = pa_session_get(session_handle); fw_ok = true; if ( !session || session->on_list != AVM_PA_LIST_ACTIVE || session->ingress_pid_handle != pid_handle) fw_ok = false; if (likely(fw_ok)) ret = pa_try_accelerate(PA_PID(ctx, pid_handle), session, !!session->bsession, skb); rcu_read_unlock_bh(); if (unlikely(ret != AVM_PA_RX_ACCELERATED)) ctx->stats.rxfw_bypass++; return ret; } EXPORT_SYMBOL(avm_pa_pid_session_receive); /* ------------------------------------------------------------------------ */ /* -------- exported functions -------------------------------------------- */ /* ------------------------------------------------------------------------ */ int avm_pa_is_enabled(void) { struct avm_pa_global *ctx = &pa_glob; return !ctx->disabled; } EXPORT_SYMBOL(avm_pa_is_enabled); void avm_pa_get_stats(struct avm_pa_stats *stats) { struct avm_pa_global *ctx = &pa_glob; memcpy(stats, &ctx->stats, sizeof(struct avm_pa_stats)); } EXPORT_SYMBOL(avm_pa_get_stats); void avm_pa_reset_stats(void) { struct avm_pa_global *ctx = &pa_glob; memset(&ctx->stats, 0, sizeof(struct avm_pa_stats)); } EXPORT_SYMBOL(avm_pa_reset_stats); void avm_pa_dev_init(struct avm_pa_dev_info *devinfo) { memset(devinfo, 0, sizeof(struct avm_pa_dev_info)); } EXPORT_SYMBOL(avm_pa_dev_init); void avm_pa_reset_skb(struct sk_buff *skb) { struct avm_pa_pkt_info *info = AVM_PKT_INFO(skb); /* Save seen flag. We must remember this, e.g. to disable * RPS for the second part of dual-session data flows. 
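 *
 * Usage sketch (illustrative; the second device info is hypothetical): a
 * caller feeding the same skb through a second avm_pa pass would do
 *
 *   avm_pa_reset_skb(skb);                   // remember and clear 'seen'
 *   avm_pa_dev_receive(second_devinfo, skb); // lookup on the second pid
 *
 * so that the second pass is not short-circuited by the 'seen' flag.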
*/ info->reset = info->seen; info->seen = 0; /* The remainder of the packet info is reset by the next * avm_pa_pid_receive() (if any) */ } EXPORT_SYMBOL(avm_pa_reset_skb); static int avm_pa_pid_receive(avm_pid_handle pid_handle, PKT *pkt) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); struct avm_pa_pkt_info *info; struct avm_pa_session *session; struct vlan_ethhdr *ethh; int rc; if (ctx->disabled) return AVM_PA_RX_OK; avm_simple_profiling_skb(0, pkt); info = AVM_PKT_INFO(pkt); if (info->seen) return AVM_PA_RX_OK; ctx->stats.rx_pkts++; rcu_read_lock_bh(); info->seen = 1; info->ingress_pid_handle = pid_handle; if (info->reset) { info->ingress_vpid_handle = 0; info->egress_pid_handle = 0; info->egress_vpid_handle = 0; info->is_accelerated = 0; info->prioack_result = PRIOACK_UNKOWN; info->routed = info->shaped = 0; info->session_handle = 0; info->do_not_accelerate = 0; info->already_modified = 0; #ifdef CONFIG_AVM_PA_RPS info->rps_done = 1; #endif /* CONFIG_AVM_PA_RPS */ avm_pa_skb_sg_reset(pkt); } if ((ethh = pa_get_ethhdr(pid->ingress_framing, pkt)) != 0) { if ((session = pa_bsession_search(pid, ethh, skb_vlan_tag_get(pkt))) != 0) { ctx->stats.rx_match++; rc = pa_try_accelerate(pid, session, true, pkt); goto out_unlock; } if ((pid->ecfg.flags & AVM_PA_PID_FLAG_NO_PID_CHANGED_CHECK) == 0) { u16 vlan_id = pa_get_vlan_id(pid->ingress_framing, pkt); pa_check_and_handle_ingress_pid_change(ethh->h_source, pid_handle, vlan_id); } } rc = pa_set_pkt_match(pid->ingress_framing, pid->ecfg.flags & AVM_PA_PID_FLAG_HSTART_ON_INGRESS ? info->hstart : 0, pkt, &info->match, 0); if (rc == AVM_PA_RX_OK) { ctx->stats.rx_search++; if ((session = pa_session_search(pid, &info->match)) == 0) { #if AVM_PA_TRACE if (ctx->dbgtrace) { pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_pid_receive(%s) - %s\n", pkt_uniq_id(pkt), pid->cfg.name, "no session"); if (ctx->dbgnosession) { char buf[64]; data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_show_pkt_info(info, pa_printk, KERN_DEBUG); } } #endif if (ctx->fw_disabled || avm_pa_capture_running()) { #if AVM_PA_TRACE if (ctx->dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_pid_receive(%s) - %s\n", pkt_uniq_id(pkt), pid->cfg.name, "forward disabled"); #endif info->do_not_accelerate = 1; } info->uniq_id = atomic_inc_return(&ctx->ingress_uniq_id); rc = AVM_PA_RX_OK; goto out_unlock; } #ifdef CONFIG_BLOG if (test_bit(PA_S_PRIOACK_CHECK, &session->flags)) blog_skip(pkt, blog_skip_reason_unknown); #endif ctx->stats.rx_match++; rc = pa_try_accelerate(pid, session, false, pkt); goto out_unlock; } if (ctx->dbgmatch) { char buf[64]; pa_printk(KERN_DEBUG, "---------->\n"); pa_printk(KERN_DEBUG, "RC : %d %s\n", rc, rc2str(rc)); data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_show_pkt_info(info, pa_printk, KERN_DEBUG); pa_printk(KERN_DEBUG, "<----------\n"); } pa_reset_match(&info->match); switch (rc) { case AVM_PA_RX_TTL: ctx->stats.rx_ttl++; break; case AVM_PA_RX_BROADCAST: ctx->stats.rx_broadcast++; break; default: ctx->stats.rx_bypass++; break; } #if AVM_PA_TRACE if (ctx->dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_pid_receive(%s) - %s (rc %d)\n", pkt_uniq_id(pkt), pid->cfg.name, "bypass", rc); #endif out_unlock: rcu_read_unlock_bh(); return rc; } static inline void avm_pa_vpid_snoop_receive(avm_vpid_handle handle, PKT *pkt) { #if AVM_PA_TRACE struct avm_pa_global *ctx = &pa_glob; if (ctx->dbgtrace) { struct avm_pa_vpid 
*vpid = PA_VPID(ctx, handle); pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_vpid_snoop_receive(%s)\n", pkt_uniq_id(pkt), vpid->cfg.name); } #endif AVM_PKT_INFO(pkt)->ingress_vpid_handle = handle; } int avm_pa_dev_local_out(struct avm_pa_dev_info *devinfo, PKT *pkt) { int rc = AVM_PA_RX_OK; if (devinfo->pid_handle) { AVM_PKT_INFO(pkt)->realtime = 1; /* avoid rps and other queues */ rc = avm_pa_pid_receive(devinfo->pid_handle, pkt); if (rc == AVM_PA_RX_ACCELERATED) return rc; /* Do not create local out sessions for multicast, it may prevent * local delivery. * See JZ-99559: FRITZ!Fon findet den FRITZ!Media Server nicht * * Unicast local-to-local is not permitted either, see snoop_transmit. */ if (AVM_PKT_INFO(pkt)->match.casttype != AVM_PA_IS_UNICAST) avm_pa_do_not_accelerate(pkt); } if (devinfo->vpid_handle) avm_pa_vpid_snoop_receive(devinfo->vpid_handle, pkt); return rc; } int avm_pa_dev_receive(struct avm_pa_dev_info *devinfo, PKT *pkt) { int rc = AVM_PA_RX_OK; if (devinfo->pid_handle) { rc = avm_pa_pid_receive(devinfo->pid_handle, pkt); if (rc == AVM_PA_RX_ACCELERATED) return rc; } if (devinfo->vpid_handle) avm_pa_vpid_snoop_receive(devinfo->vpid_handle, pkt); return rc; } EXPORT_SYMBOL(avm_pa_dev_receive); int avm_pa_dev_pid_receive(struct avm_pa_dev_info *devinfo, PKT *pkt) { struct avm_pa_global *ctx = &pa_glob; int rc = AVM_PA_RX_OK; avm_simple_profiling_skb(0, pkt); if (devinfo->pid_handle) { struct avm_hardware_pa *hwpa = &ctx->hardware_pa; /* We must be careful here since try_to_accelerate might be module code * that could be unloaded our the back. Therefore we must get an explicit * ref on the hardware_pa since we aren't tied to a session yet. */ if (hwpa->try_to_accelerate && !ctx->hw_ppa_disabled && pa_hw_pa_get()) { struct avm_pa_pid *pid = PA_PID(ctx, devinfo->pid_handle); if (pid->rx_channel_activated) { if (pid->rx_channel_stopped == 0) { rc = hwpa->try_to_accelerate(devinfo->pid_handle, pkt); } else { ctx->stats.rx_channel_stopped++; } } pa_hw_pa_put(); } if (rc == AVM_PA_RX_OK) { /* Try to lookup session unless try_to_accelerate() determines invalid packet. */ rc = avm_pa_pid_receive(devinfo->pid_handle, pkt); } else if (rc >= AVM_PA_RX_BYPASS) { /* Other avm_pa_pid_receive() calls down the road shall not inspect the packet. 
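 * Setting 'seen' makes avm_pa_pid_receive() return AVM_PA_RX_OK early,
 * without touching the packet again.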
*/ AVM_PKT_INFO(pkt)->seen = 1; } else if (rc < 0) { /* packet dropped due to an error */ rc = AVM_PA_RX_STOLEN; } } return rc; } EXPORT_SYMBOL(avm_pa_dev_pid_receive); void avm_pa_dev_vpid_snoop_receive(struct avm_pa_dev_info *devinfo, PKT *pkt) { if (devinfo->vpid_handle) avm_pa_vpid_snoop_receive(devinfo->vpid_handle, pkt); } EXPORT_SYMBOL(avm_pa_dev_vpid_snoop_receive); void avm_pa_mark_routed(PKT *pkt) { AVM_PKT_INFO(pkt)->routed = 1; #if AVM_PA_TRACE if (pa_glob.dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_mark_routed (ingress %d)\n", pkt_uniq_id(pkt), AVM_PKT_INFO(pkt)->ingress_pid_handle); #endif } EXPORT_SYMBOL(avm_pa_mark_routed); void avm_pa_mark_shaped(PKT *pkt) { AVM_PKT_INFO(pkt)->shaped = 1; #if AVM_PA_TRACE if (pa_glob.dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - %s (ingress %d)\n", pkt_uniq_id(pkt), __func__, AVM_PKT_INFO(pkt)->ingress_pid_handle); #endif } EXPORT_SYMBOL(avm_pa_mark_shaped); void avm_pa_skb_set_rps(struct sk_buff *skb, const struct cpumask *allow, const struct cpumask *fallback) { #ifdef CONFIG_AVM_PA_RPS AVM_PKT_INFO(skb)->rps_override = 1; AVM_PKT_INFO(skb)->rps_allowed_mask = *allow; AVM_PKT_INFO(skb)->rps_fallback_mask = *fallback; #if AVM_PA_TRACE if (pa_glob.dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - %s (ingress %d)\n", pkt_uniq_id(skb), __func__, AVM_PKT_INFO(skb)->ingress_pid_handle); #endif #endif } EXPORT_SYMBOL(avm_pa_skb_set_rps); void avm_pa_use_protocol_specific_session(PKT *pkt) { AVM_PKT_INFO(pkt)->use_protocol_specific = 1; #if AVM_PA_TRACE if (pa_glob.dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_use_protocol_specific_session (ingress %d)\n", pkt_uniq_id(pkt), AVM_PKT_INFO(pkt)->ingress_pid_handle); #endif } EXPORT_SYMBOL(avm_pa_use_protocol_specific_session); void avm_pa_do_not_accelerate(PKT *pkt) { AVM_PKT_INFO(pkt)->do_not_accelerate = 1; #if AVM_PA_TRACE if (pa_glob.dbgtrace) pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_do_not_accelerate\n", pkt_uniq_id(pkt)); #endif } EXPORT_SYMBOL(avm_pa_do_not_accelerate); void avm_pa_set_hstart(PKT *pkt, unsigned int hstart) { AVM_PKT_INFO(pkt)->hstart = hstart; } EXPORT_SYMBOL(avm_pa_set_hstart); static inline void avm_pa_vpid_snoop_transmit(avm_vpid_handle handle, PKT *pkt) { struct avm_pa_pkt_info *info = AVM_PKT_INFO(pkt); if (info->egress_vpid_handle == 0) info->egress_vpid_handle = handle; #if AVM_PA_TRACE if (pa_glob.dbgtrace) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_vpid *vpid = PA_VPID(ctx, handle); pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_vpid_snoop_transmit(%s)\n", pkt_uniq_id(pkt), vpid->cfg.name); } #endif } static inline unsigned int pa_calc_tack_priority(struct avm_pa_pkt_info *info, struct avm_pa_pid *epid, unsigned int orig_priority) { unsigned int newprio = orig_priority; if (AVM_PA_PKTTYPE_IPPROTO(info->match.pkttype) == IPPROTO_TCP) { unsigned int prio; prio = avm_pa_pid_tack_enabled(epid) ? avm_pa_pid_tack_prio(epid, orig_priority) : 0; if (prio != 0 && prio < newprio) newprio = prio; } return newprio; } static inline unsigned int pa_calc_start_priority(struct avm_pa_pkt_info *info, struct avm_pa_pid *epid, unsigned int orig_priority) { /* * We calculate the priority to use, when session is created. * We assume it's an TGET or TACK session. The final decision will be made in * pa_session_prioack_check(). * 2016-10-14 calle */ unsigned int newprio = orig_priority; if (AVM_PA_PKTTYPE_IPPROTO(info->match.pkttype) == IPPROTO_TCP) { unsigned int prio; prio = avm_pa_pid_tget_enabled(epid) ? 
avm_pa_pid_tget_prio(epid, orig_priority) : 0; if (prio != 0 && prio < newprio) newprio = prio; prio = avm_pa_pid_tack_enabled(epid) ? avm_pa_pid_tack_prio(epid, orig_priority) : 0; if (prio != 0 && prio < newprio) newprio = prio; } return newprio; } static inline int avm_pa_pid_snoop_transmit(avm_pid_handle pid_handle, PKT *pkt, enum avm_pa_egresstype etype, void *edata) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; struct avm_pa_pkt_info *info = AVM_PKT_INFO(pkt); struct avm_pa_session *session, *take_over_session; struct avm_pa_egress *egress; struct avm_pa_pkt_match match; struct avm_pa_pid *ipid, *epid; struct avm_pa_vpid *evpid; struct vlan_ethhdr *ethh; int headroom; char buf[64]; int ret; struct sock *sk = NULL; struct xfrm_state *x = NULL; bool tcp_syn, tcp_ack, tcp_fin, tcp_nodata; u16 vlan_id, ingress_vlan_id, is_pvid; #if AVM_PA_TRACE if (ctx->dbgtrace) { epid = PA_PID(ctx, pid_handle); pa_printk(KERN_DEBUG, "avm_pa: %lu - avm_pa_pid_snoop_transmit(%s)\n", pkt_uniq_id(pkt), epid->cfg.name); } #endif if (ctx->disabled) return AVM_PA_TX_OK; tcp_syn = tcp_fin = tcp_ack = tcp_nodata = false; rcu_read_lock_bh(); epid = PA_PID(ctx, pid_handle); if (info->do_not_accelerate) { ctx->stats.tx_bypass++; if (ctx->dbgnosession) { pa_printk(KERN_DEBUG, "Bypass : do not accelerate\n"); data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_printk(KERN_DEBUG, "---------------\n"); } goto tx_bypass; } if (info->ingress_pid_handle == 0) { ctx->stats.tx_local++; goto tx_bypass; } if (etype == avm_pa_egresstype_local) { sk = edata; if (info->ingress_pid_handle == pid_handle) { /* Traffic through "lo" interface triggers sessions * but we don't want to waste sessions on local IPC traffic. */ ctx->stats.tx_loopback++; goto tx_bypass; } } else if (etype == avm_pa_egresstype_xfrm) { x = edata; /* skip if no xfrm_state is given or there's an additional udp encap */ if (!x || x->encap || !IS_ENABLED(CONFIG_XFRM)) { ctx->stats.tx_bypass++; goto tx_bypass; } } ipid = PA_PID(ctx, info->ingress_pid_handle); ethh = pa_get_ethhdr(epid->egress_framing, pkt); take_over_session = NULL; if (info->session_handle != 0) { BUG_ON(info->egress_pid_handle == 0); if (info->egress_pid_handle != pid_handle) { take_over_session = PA_SESSION(pd, info->session_handle); } else { ctx->stats.tx_already++; goto tx_bypass; } } ret = pa_egress_precheck(epid, pkt, &info->match, &match); if (ret != AVM_PA_RX_OK) { ctx->stats.tx_bypass++; if (ctx->dbgnosession) { pa_printk(KERN_DEBUG, "Bypass : precheck failed (%d)\n", ret); data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_printk(KERN_DEBUG, "<- pkt_info ->\n"); pa_show_pkt_info(info, pa_printk, KERN_DEBUG); pa_printk(KERN_DEBUG, "<- pkt_match ->\n"); pa_show_pkt_full_match(&match, 0, pa_printk, KERN_DEBUG); pa_printk(KERN_DEBUG, "---------------\n"); } goto tx_bypass; } /* Bypass SYN packets but allow session creation by SYN+ACK (no payload), * such that the first data segment can be accelerated. After TCP handshake * any packet except FIN (or RST) can create sessions. 
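 * In short: SYN without ACK, FIN and RST bypass session creation; SYN+ACK
 * and ordinary data/ack segments may create one.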
*/ if (AVM_PA_PKTTYPE_IPPROTO(match.pkttype) == IPPROTO_TCP) { struct tcphdr *tcph; u8 *data; switch (epid->egress_framing) { case avm_pa_framing_ipdev: case avm_pa_framing_ptype: data = skb_network_header(pkt); break; case avm_pa_framing_dev: data = skb_mac_header(pkt); break; default: data = pkt->data; break; } tcph = (struct tcphdr *) (data + match.l4_offset); tcp_syn = PA_TCP_SYN(tcph); tcp_fin = PA_TCP_FIN_OR_RST(tcph); tcp_ack = PA_TCP_ACK(tcph); tcp_nodata = pa_match_is_tcp_nodata(&match, data); if ((tcp_syn && !tcp_ack) || tcp_fin) { ctx->stats.tx_bypass++; if (ctx->dbgnosession) { pa_printk(KERN_DEBUG, "Bypass : %s\n", tcp_fin ? "Fin" : "Syn"); data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_printk(KERN_DEBUG, "<- pkt_info ->\n"); pa_show_pkt_info(info, pa_printk, KERN_DEBUG); pa_printk(KERN_DEBUG, "---------------\n"); } goto tx_bypass; } } vlan_id = pa_get_vlan_match(&match) & VLAN_VID_MASK; ingress_vlan_id = pa_get_vlan_match(&info->match) & VLAN_VID_MASK; /* If vlan is the same ingress and egress then we consider this "non-pvid" */ is_pvid = vlan_id != ingress_vlan_id; /* This won't find bridge sessions which will create duplicate sessions. * Well, temporarly as they don't get past pa_session_activate(). * * In case of pid take over, this would find the existing session, thus not * proceed with session creation. But we do need to try that to decide * whether to take over or not. */ if (take_over_session || !(session = pa_session_search(ipid, &info->match))) { int (*probe_session)(struct avm_pa_session *avm_session); int (*add_session)(struct avm_pa_session *avm_session); int (*add_session_skb)(struct avm_pa_session *avm_session, struct sk_buff *skb); int hw_ok; bool __maybe_unused rps_ok; /* Grab temporary references for use during CREATE state. * If the session fails to reach ACTIVE state, then pa_session_kill() will take * care of these. Otherwise pa_session_activate() will render them permanent. * * In any case we don't have to release them ourselves once we have both. 
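 * (If no session could be allocated here, the error path right below drops
 *  whatever references were taken.)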
*/ avm_pid_handle ingress_pid_handle = pa_pid_get(info->ingress_pid_handle); avm_pid_handle egress_pid_handle = pa_pid_get(pid_handle); if (likely(ingress_pid_handle && egress_pid_handle) || take_over_session) session = pa_session_alloc(&info->match); if (!session) { /* Maybe we couldn't ref a PID, release the other one */ if (ingress_pid_handle) pa_pid_put(ingress_pid_handle); if (egress_pid_handle) pa_pid_put(egress_pid_handle); if (sk) ctx->stats.local_sess_error++; else ctx->stats.tx_sess_error++; ret = AVM_PA_TX_ERROR_SESSION; goto out; } /* Session State: CREATE */ session->ingress_uniq_id = info->uniq_id; session->ingress_pid_handle = ingress_pid_handle; session->ingress_vpid_handle = info->ingress_vpid_handle; session->ingress_priority = pa_get_ingress_priority_from_pkt_mark(pkt->mark); if (info->routed) set_bit(PA_S_ROUTED, &session->flags); if (info->no_hw) set_bit(PA_S_NO_HW, &session->flags); if (info->realtime) set_bit(PA_S_REALTIME, &session->flags); egress = avm_pa_first_egress(session); egress->pid_handle = egress_pid_handle; egress->vpid_handle = info->egress_vpid_handle; egress->match = match; egress->type = etype; switch (etype) { case avm_pa_egresstype_local: egress->local.dev = pkt->dev; egress->local.dst = dst_clone(skb_dst(pkt)); egress->local.skb_iif = pkt->skb_iif; rps_ok = false; break; case avm_pa_egresstype_xfrm: if (IS_ENABLED(CONFIG_XFRM)) { dev_hold(pkt->dev); xfrm_state_hold(x); egress->xfrm.dev = pkt->dev; egress->xfrm.x = x; egress->xfrm.dst = dst_clone(skb_dst(pkt)); /* Ensure tx_arg == NULL since we always pass the xfrm_state */ BUG_ON(PA_PID(ctx, egress->pid_handle)->cfg.tx_arg != NULL); egress->xfrm.tc_index = pkt->tc_index; rps_ok = true; } break; case avm_pa_egresstype_null: rps_ok = false; break; default: egress->output.dst = skb_dst(pkt) ? dst_clone(skb_dst(pkt)) : NULL; egress->output.priority = pkt->priority; egress->output.txq_id = skb_get_queue_mapping(pkt); egress->output.tc_index = pkt->tc_index; egress->output.skb_iif = pkt->skb_iif; egress->output.mac_len = pkt->mac_len; #ifdef CONFIG_TI_PACKET_PROCESSOR egress->output.puma_pktinfo = *SKB_GET_PP_INFO_P(pkt); #ifdef CONFIG_TI_META_DATA egress->output.ti_meta_info = pkt->ti_meta_info; egress->output.ti_meta_info2 = pkt->ti_meta_info2; #endif #endif rps_ok = true; break; } #ifdef CONFIG_AVM_PA_RPS /* For local sessions we try to keep it on the same CPU as the receiving * process. For now we assume the kernel has already selected the best cpu * and follow its decision. If RPS was configured explicitly via * avm_pa_skb_set_rps() then we commit to that CPU at session creation. * Likewise, for drop sessions, we want to drop on the receiving CPU as * there is no further packet processing. * * Otherwise, CPU selection (via hash based on the flow) is deferred * to the fast path because there may be multiple flows within a single * bridge session. 
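 * Note on the encoding: session->rps_cpu stores the chosen CPU plus one;
 * zero means "not pinned, select by flow hash in pa_rps_forward()".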
*/ if (info->rps_override || !rps_ok) { int cpu = info->match.hash & (CONFIG_AVM_PA_RPS_QUEUES-1); if (!rps_ok) cpu = smp_processor_id(); if (info->rps_override) { if (!cpumask_test_cpu(cpu, &info->rps_allowed_mask)) { cpu = cpumask_any_but(&info->rps_fallback_mask, cpu); if (cpu >= nr_cpu_ids) { cpu = cpumask_first(&info->rps_fallback_mask); if (cpu >= nr_cpu_ids) cpu = smp_processor_id(); /* RPS disabled */ } } } session->rps_cpu = cpu + 1; } #endif /* Bridged session are more efficient, but subject to a few restrictions: * - ethernet header must match, and nothing else * - packets must be bridged, not routed (obviously) * - must be unicast as broadcast/multicast means multiple egress, which might require * different framings or even local input, which make plain bridging impossible * - avm_pa_use_protocol_specific_session() wasn't used to enforce normal sessions * - ingress and egress pids permit bridged sessions * - hardware_pa permits bridged sessions * - bridged sessions aren't disallowed through procfs interface * If all conditions are met, bridged sessions can use a few shortcuts such * as skipping data modification entirely. */ if ( ethh && ctx->bsession_allowed && info->routed == 0 && info->match.casttype == AVM_PA_IS_UNICAST && info->use_protocol_specific == 0 && (ipid->bridging_ok && epid->bridging_ok) && (ctx->hw_ppa_disabled || !(ctx->hardware_pa.flags & AVM_HW_F_NO_BSESSION)) && pa_match_bridged(&info->match, &egress->match)) { pa_change_to_bridge_session(session); egress->pppoe_offset = AVM_PA_OFFSET_NOT_SET; egress->push_l2_len = 0; egress->mtu = 0xffff; } else { /* Carefully avoid doing prioack repeatedly for the same flow. */ if (egress->type == avm_pa_egresstype_output) { u32 priority; if (info->prioack_result == PRIOACK_UNKOWN) { priority = pa_calc_start_priority(info, epid, pkt->priority); if (pkt->priority != priority) { set_bit(PA_S_PRIOACK_CHECK, &session->flags); /* pa_session_prioack_check() will check priority */ #ifdef CONFIG_BLOG blog_skip(pkt, blog_skip_reason_unknown); #endif if (ctx->dbgprioack) { pa_printk(KERN_DEBUG, "avm_pa: session %d: priority %x:%x TGET (orignal %x:%x)\n", session->session_handle, TC_H_MAJ(priority)>>16, TC_H_MIN(priority), TC_H_MAJ(pkt->priority)>>16, TC_H_MIN(pkt->priority)); } } } else if (info->prioack_result == PRIOACK_ACK) { /* An earlier session has determined this is an ACK flow, * therefore this new session is using tack priority. 
*/ priority = pa_calc_tack_priority(info, epid, pkt->priority); } else { priority = pkt->priority; } pkt->priority = egress->output.priority = priority; #ifdef CONFIG_BLOG pkt->mark = SKBMARK_SET_Q(pkt->mark, (BROADCOM_MAX_PRIOS - (pkt->priority & TC_H_MIN_MASK))); pkt->mark = SKBMARK_SET_FLOW_ID(pkt->mark, 0x1); #endif } pa_calc_modify(session, &info->match, &match); if (match.encap_offset == AVM_PA_OFFSET_NOT_SET) egress->push_l2_len = match.ip_offset; else egress->push_l2_len = match.encap_offset; headroom = (session->mod.push_encap_len + egress->push_l2_len) - (session->mod.pull_l2_len + session->mod.pull_encap_len); if (headroom > 0 && headroom > session->needed_headroom) session->needed_headroom = headroom; egress->pppoe_offset = match.pppoe_offset; if (egress->pppoe_offset != AVM_PA_OFFSET_NOT_SET) egress->pppoe_hdrlen = egress->pppoe_offset + sizeof(struct pppoehdr); egress->mtu = epid->cfg.default_mtu; if (egress->vpid_handle) { evpid = PA_VPID(ctx, egress->vpid_handle); if (session->mod.outer_ipversion == 4) { if (evpid->cfg.v4_mtu < egress->mtu) egress->mtu = evpid->cfg.v4_mtu; } else if (session->mod.outer_ipversion == 6) { if (evpid->cfg.v6_mtu < egress->mtu) egress->mtu = evpid->cfg.v6_mtu; } } } if (ethh) egress->destmac = pa_macaddr_link(ethh->h_dest, egress_pid_handle, is_pvid, vlan_id); if (epid->ecfg.cb_len) memcpy(egress->cb, &pkt->cb[epid->ecfg.cb_start], epid->ecfg.cb_len); #if (defined(CONFIG_AVM_GENERIC_CONNTRACK) || defined(CONFIG_AVM_PA_GENERIC_CT)) if (SKB_GENERIC_CT(pkt)) { session->generic_ct = generic_ct_get(SKB_GENERIC_CT(pkt)); session->generic_ct_dir = SKB_GENERIC_CT_DIR(pkt); /* don't do generic_ct_sessionid_set() yet because the session is not * activated yet, so don't use the session_handle yet */ #if (defined(CONFIG_BLOG) && IS_ENABLED(CONFIG_NF_CONNTRACK) && defined(CONFIG_AVM_PA_GENERIC_CT_PP_SUPPORT)) /* * If the hw_pa in this case flow cache is turned on: * - register the struct generic_ct * of the datapipe flow * at the struct nf_conn * of the netfilter connection * - save the struct nf_conn * in the avm_pa session * * Note: struct nf_conn* allocated in nf_conntrack_in() */ if (!ctx->hw_ppa_disabled && skb_nfct(pkt) && (AVM_PA_PKTTYPE_IPPROTO(info->match.pkttype) == IPPROTO_TCP || AVM_PA_PKTTYPE_IPPROTO(info->match.pkttype) == IPPROTO_UDP)) { struct nf_conn *ctrack; nf_conntrack_get(skb_nfct(pkt)); ctrack = (struct nf_conn *) skb_nfct(pkt); ctrack->generic_ct = generic_ct_get(SKB_GENERIC_CT(pkt)); session->ct = (struct nf_conn *) skb_nfct(pkt); } #endif } #endif /* Hardware session offloading * * If all pre-conditions are met we first probe if the session is acceleratable at * all. Actually adding the session to hardware is done once the session * is transitioned to ACTIVE state. Before that transition there can be multiple * packets racing to create the same session. probe_session() is expected to deal * with that, on the assumption that it's mostly stateless and doesn't talk to * the hardware yet. * * In contrast, add_session() does the actual work and we do not want to * confuse that part with multiple, equal sessions. * * Pre-conditions for offloading: * a) prioack_check is NOT set. Not all HW-PA provide packet and byte counters so * we must use counters from software acceleration. * b) HW-PA is available * c) HW-PA is not disabled * d) no_hw flag was not set in AVM_PKT_INFO (eg. by sch_tack) * e) On vr9, not a local session (JZ-26496) * * In the future probe_session will become mandatory. 
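 *
 * A minimal backend sketch; the backend names are hypothetical, the return
 * codes are the ones checked below (anything other than
 * AVM_PA_TX_ERROR_SESSION counts as a successful probe, add_session() must
 * return AVM_PA_TX_SESSION_ADDED on success):
 *
 *   static int my_hw_probe(struct avm_pa_session *s)
 *   {
 *      return my_hw_can_offload(s) ? AVM_PA_TX_OK : AVM_PA_TX_ERROR_SESSION;
 *   }
 *
 *   static int my_hw_add(struct avm_pa_session *s)
 *   {
 *      return my_hw_program_flow(s) == 0 ? AVM_PA_TX_SESSION_ADDED
 *                                        : AVM_PA_TX_ERROR_SESSION;
 *   }
 *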
For now it's optional * and if it's not provided we assume the session can be offloaded. */ probe_session = rcu_dereference(ctx->hardware_pa.probe_session); add_session = rcu_dereference(ctx->hardware_pa.add_session); add_session_skb = rcu_dereference(ctx->hardware_pa.add_session_skb); hw_ok = !test_bit(PA_S_PRIOACK_CHECK, &session->flags) // a && (add_session || add_session_skb) // b && !ctx->hw_ppa_disabled // c && !test_bit(PA_S_NO_HW, &session->flags); // d #ifdef CONFIG_VR9 if (etype == avm_pa_egresstype_local) hw_ok = 0; // e #endif if (hw_ok && probe_session) hw_ok = probe_session(session) != AVM_PA_TX_ERROR_SESSION; /* A lower-level pid is taking over. This creates a new session, * for many reasons: * - updating the egress would be subject to race conditions * since the session is already in state ACTIVE * - the egress match info and mod record must be parsed again * (for example, vlan may change) * - as a result, the classification as bridged session may change * - is super easy to implement (really just need to flush here) * * Keep in mind that this check is only done for the first packet * (is_accelerated == 0). * * If the current pid performs traffic shaping, this take over is * usually prevented since QoS would be bypassed. Except if the new * session would qualify for HW offloading. Then we prefer offloading * because we typically perform adequate HW-assisted QoS. * * If the current pid performs prioack check and the new pid doesn't, * then this take over is also prevented. Once prioack decision is * completed then the new session may take over, for this reason * take over is possible in the accelerated path. * */ if ( take_over_session && (hw_ok || info->shaped == 0) && !test_bit(PA_S_PRIOACK_CHECK, &take_over_session->flags)) { ctx->stats.tx_pid_change++; pa_session_flush(take_over_session, "pid take over"); } /* * The selector is asked at last, because the session is not fully setup until now. * The session framework needs complete session info to make an informed decision. */ if (ctx->filter_enabled && !avm_pa_session_is_selected(&ctx->accel_filter, session)) { ctx->stats.tx_bypass++; if (ctx->dbgnosession) { pa_printk(KERN_DEBUG, "Acceleration filtered\n"); data2hex(PKT_DATA(pkt), PKT_LEN(pkt), buf, sizeof(buf)); pa_printk(KERN_DEBUG, "Data : %s\n", buf); pa_show_pkt_info(info, pa_printk, KERN_DEBUG); } pa_session_kill(session, "filtered"); goto tx_bypass; } /* activate guarantees that only one session of a kind exists but it also * hands over the session to the lookup so that newer packets (perhaps * on another CPU) can already use this session before we return */ ret = pa_session_activate(session); if (ret != AVM_PA_TX_SESSION_ADDED) goto out; /* Session State: ACTIVE */ avm_pa_sg_session_link(session, pkt); #if AVM_PA_TRACE if (ctx->dbgtrace) { pa_printk(KERN_DEBUG, "avm_pa: add session %d (%s)\n", session->session_handle, ipid->cfg.name); } #endif if (ctx->dbgsession) { if (session->bsession) { pa_printk(KERN_DEBUG, "\navm_pa: new bsession:\n"); pa_show_bsession(session->bsession, pa_printk, KERN_DEBUG); } else { pa_printk(KERN_DEBUG, "\navm_pa: new session:\n"); pa_show_session(session, pa_printk, KERN_DEBUG); } } /* Do the actual offload. If this succeeds the session will add a reference * to the hardware pa. */ if (hw_ok && pa_hw_pa_get()) { int added = add_session_skb ? 
add_session_skb(session, pkt) : add_session(session); if (added == AVM_PA_TX_SESSION_ADDED) { set_bit(PA_S_IN_HW, &session->flags); } else { pa_hw_pa_put(); } } if (etype == avm_pa_egresstype_local) ctx->stats.local_sess_ok++; else if (etype == avm_pa_egresstype_null) ctx->stats.drop_sess_ok++; else if (IS_ENABLED(CONFIG_XFRM) && etype == avm_pa_egresstype_xfrm) ctx->stats.xfrm_sess_ok++; else ctx->stats.tx_sess_ok++; info->session_handle = session->session_handle; info->egress_pid_handle = pid_handle; ret = AVM_PA_TX_SESSION_ADDED; goto out; } /* It's a slow packet with existing session, this happens in case of * active packet tracing or batched rx processing (i.e. GRX) or concurrent * packet processing (e.g. hawkeye). * * We only add egress to the session if the packet was cloned from the original one. */ if (info->uniq_id != session->ingress_uniq_id) { /* ignore concurrent packet (not cloned from original skb in ingress) */ if (etype == avm_pa_egresstype_local) { ctx->stats.local_sess_exists++; } else { ctx->stats.tx_sess_exists++; } pa_session_update(session); /* use priority we decide to use for this egress */ avm_pa_for_each_egress(egress, session) { if (egress->pid_handle != pid_handle) continue; if (egress->type == avm_pa_egresstype_output) { pkt->priority = egress->output.priority; #ifdef CONFIG_BLOG if (avm_pa_pid_tack_enabled(PA_PID(ctx, egress->pid_handle))) { pkt->mark = SKBMARK_SET_Q(pkt->mark, (BROADCOM_MAX_PRIOS - (pkt->priority & TC_H_MIN_MASK))); pkt->mark = SKBMARK_SET_FLOW_ID(pkt->mark, 0x1); } #endif break; } } ret = AVM_PA_TX_SESSION_EXISTS; goto out; } /* Add egress for cloned packet. * * We don't check if the payload truly changed compared to existing egress, * if the slow path demands duplicated packets we follow suit. */ if ((egress = pa_egress_alloc()) != NULL) { u16 mtu; egress->pid_handle = pa_pid_get(pid_handle); if (unlikely(!egress->pid_handle)) { pa_egress_free(egress); goto no_egress; } egress->vpid_handle = info->egress_vpid_handle; egress->match = match; egress->type = etype; if (etype == avm_pa_egresstype_local) { egress->local.dev = pkt->dev; egress->local.dst = dst_clone(skb_dst(pkt)); egress->local.skb_iif = pkt->skb_iif; } else if (etype == avm_pa_egresstype_output) { egress->output.priority = pkt->priority; egress->output.txq_id = skb_get_queue_mapping(pkt); egress->output.tc_index = pkt->tc_index; egress->output.skb_iif = pkt->skb_iif; egress->output.mac_len = pkt->mac_len; #ifdef CONFIG_TI_PACKET_PROCESSOR egress->output.puma_pktinfo = *SKB_GET_PP_INFO_P(pkt); #endif } /* multi-egress sessions (multicast) are always full sessions, * i.e. vlan info is regular part of the session. */ if (ethh) egress->destmac = pa_macaddr_link(ethh->h_dest, egress->pid_handle, is_pvid, vlan_id); if (epid->ecfg.cb_len) memcpy(egress->cb, &pkt->cb[epid->ecfg.cb_start], epid->ecfg.cb_len); mtu = epid->cfg.default_mtu; if (egress->vpid_handle) { evpid = PA_VPID(ctx, egress->vpid_handle); if (session->mod.outer_ipversion == 4) { if (evpid->cfg.v4_mtu < mtu) mtu = evpid->cfg.v4_mtu; } else if (session->mod.outer_ipversion == 6) { if (evpid->cfg.v6_mtu < mtu) mtu = evpid->cfg.v6_mtu; } } if (session->bsession) { egress->pppoe_offset = AVM_PA_OFFSET_NOT_SET; egress->push_l2_len = 0; mtu = 0xffff; } else { /* * currently we do only TACK/TGET handling only on egress[0]. * So we keep SKBs original priority. 
* 2016-10-14 calle */ if (match.encap_offset == AVM_PA_OFFSET_NOT_SET) egress->push_l2_len = match.ip_offset; else egress->push_l2_len = match.encap_offset; headroom = (session->mod.push_encap_len + egress->push_l2_len) - (session->mod.pull_l2_len + session->mod.pull_encap_len); if (headroom > 0 && headroom > session->needed_headroom) session->needed_headroom = headroom; egress->pppoe_offset = match.pppoe_offset; if (egress->pppoe_offset != AVM_PA_OFFSET_NOT_SET) egress->pppoe_hdrlen = egress->pppoe_offset + sizeof(struct pppoehdr); } egress->mtu = mtu; /* Atomically add the egress, after initialization. Fixes JZ-26868. */ spin_lock(&avm_pa_lock); hlist_add_behind_rcu(&egress->egress_list, &avm_pa_first_egress(session)->egress_list); ++session->negress; spin_unlock(&avm_pa_lock); if (test_bit(PA_S_IN_HW, &session->flags) && pa_hw_pa_get()) { pa_session_handle_stats(session); /* We must be careful that two (or more) egress don't race here * for calling pa_hw_pa_put(), as the session holds at most one reference. * When change_session() is not available, also carefully avoid * calling remove_session() concurrently. */ if (ctx->hardware_pa.change_session) { if ((*ctx->hardware_pa.change_session)(session) != AVM_PA_TX_EGRESS_ADDED) { if (test_and_clear_bit(PA_S_IN_HW, &session->flags)) { pa_hw_pa_put(); } } } else if (test_and_clear_bit(PA_S_IN_HW, &session->flags)) { /* The bit is cleared before calling remove_session() since we couldn't * prevent other hw_pa calls during long-running remove_session() calls. */ (*ctx->hardware_pa.remove_session)(session); pa_hw_pa_put(); } pa_hw_pa_put(); } ctx->stats.tx_egress_ok++; if (ctx->dbgsession) { pa_printk(KERN_DEBUG, "\navm_pa: new egress:\n"); pa_show_session(session, pa_printk, KERN_DEBUG); } info->session_handle = session->session_handle; info->egress_pid_handle = pid_handle; ret = AVM_PA_TX_EGRESS_ADDED; goto out; } no_egress: /* * JZ-56718: flush the entire session and try to allocate * all egress ports with the next set of slow path packets */ pa_session_flush(session, "no egress left"); ctx->stats.tx_egress_error++; ret = AVM_PA_TX_ERROR_EGRESS; goto out; tx_bypass: /* * set TACK priority for TCP control and ack only packets * 2016-10-14 calle */ if (avm_pa_pid_tack_enabled(epid)) { if (tcp_syn || tcp_fin || (tcp_ack && tcp_nodata)) { pkt->priority = pa_calc_tack_priority(info, epid, pkt->priority); epid->prioack_acks++; } } ret = AVM_PA_TX_BYPASS; out: rcu_read_unlock_bh(); return ret; } int _avm_pa_dev_snoop_transmit(struct avm_pa_dev_info *devinfo, struct sk_buff *skb) { if (devinfo->vpid_handle) avm_pa_vpid_snoop_transmit(devinfo->vpid_handle, skb); if (devinfo->pid_handle) return avm_pa_pid_snoop_transmit(devinfo->pid_handle, skb, avm_pa_egresstype_output, 0); return AVM_PA_TX_OK; } EXPORT_SYMBOL(_avm_pa_dev_snoop_transmit); void avm_pa_dev_vpid_snoop_transmit(struct avm_pa_dev_info *devinfo, PKT *pkt) { if (devinfo->vpid_handle) avm_pa_vpid_snoop_transmit(devinfo->vpid_handle, pkt); } EXPORT_SYMBOL(avm_pa_dev_vpid_snoop_transmit); void _avm_pa_add_local_session(struct avm_pa_dev_info *devinfo, struct sk_buff *skb, struct sock *sk) { (void)avm_pa_pid_snoop_transmit(devinfo->pid_handle, skb, avm_pa_egresstype_local, sk); } EXPORT_SYMBOL(_avm_pa_add_local_session); void avm_pa_add_drop_session(struct avm_pa_dev_info *devinfo, struct sk_buff *skb) { struct avm_pa_pkt_info *info = AVM_PKT_INFO(skb); if (info->is_accelerated == 0 && devinfo->pid_handle) (void)avm_pa_pid_snoop_transmit(devinfo->pid_handle, skb, avm_pa_egresstype_null, NULL); } 
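/* Usage sketch for drop sessions (illustrative only; the caller and its
 * verdict handling are hypothetical): a hook that decided to discard a flow
 * can install a drop session so that follow-up packets are dropped on the
 * fast path:
 *
 *   if (verdict == MY_DROP) {
 *      avm_pa_add_drop_session(AVM_PA_DEVINFO(skb->dev), skb);
 *      kfree_skb(skb);
 *   }
 */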
EXPORT_SYMBOL(avm_pa_add_drop_session); void avm_pa_add_xfrm_session(struct avm_pa_dev_info *devinfo, struct sk_buff *skb, struct xfrm_state *x) { if (unlikely(!AVM_PKT_INFO(skb)->is_accelerated) && IS_ENABLED(CONFIG_XFRM)) avm_pa_pid_snoop_transmit(devinfo->pid_handle, skb, avm_pa_egresstype_xfrm, x); } void avm_pa_add_rtp_session(PKT *pkt, struct sock *sk, void (*transmit)(struct sock *sk, PKT *pkt)) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pkt_info *info = AVM_PKT_INFO(pkt); struct avm_pa_session *session; struct avm_pa_egress *egress; spin_lock_bh(&avm_pa_lock); session = pa_session_get(info->session_handle); if (session == 0 || session->negress > 1) goto unlock; egress = avm_pa_first_egress(session); if (egress->type != avm_pa_egresstype_local) { if (egress->type == avm_pa_egresstype_rtp) ctx->stats.rtp_sess_exists++; else ctx->stats.rtp_sess_error++; goto unlock; } set_bit(PA_S_REALTIME, &session->flags); egress->type = avm_pa_egresstype_rtp; egress->rtp.skb_iif = pkt->skb_iif; sock_hold(sk); egress->rtp.sk = sk; egress->rtp.transmit = transmit; ctx->stats.rtp_sess_ok++; unlock: spin_unlock_bh(&avm_pa_lock); } EXPORT_SYMBOL(avm_pa_add_rtp_session); void avm_pa_filter_packet(PKT *pkt) { /* Drop/filter sessions are not implemented yet. */ WARN_ONCE(1, "%s() does nothing. Remove the call!\n", __func__); } EXPORT_SYMBOL(avm_pa_filter_packet); int avm_pa_dev_pidhandle_register_with_ingress(struct avm_pa_dev_info *devinfo, avm_pid_handle pid_handle, struct avm_pa_pid_cfg *cfg, avm_pid_handle ingress_pid_handle) { struct avm_pa_global *ctx = &pa_glob; avm_pid_handle n; /* Already registered */ if (devinfo->pid_handle) { BUG_ON(pid_handle && devinfo->pid_handle != pid_handle); return -EBUSY; } if (ingress_pid_handle) { if (PA_PID(ctx, ingress_pid_handle)->pid_handle != ingress_pid_handle) return -EINVAL; /* ingress pid must be registered beforehand */ } if (pid_handle) { n = pid_handle; goto slot_found; } for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { if (strncmp(cfg->name, PA_PID(ctx, n)->cfg.name, AVM_PA_MAX_NAME) == 0) goto slot_found; } for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { if (PA_PID(ctx, n)->pid_handle || kref_read(&PA_PID(ctx, n)->ref)) continue; else goto slot_found; } return -ENOMEM; slot_found: pa_pid_init(n, cfg); if (ingress_pid_handle) { PA_PID(ctx, n)->ingress_pid_handle = ingress_pid_handle; } else { PA_PID(ctx, n)->ingress_pid_handle = n; } devinfo->pid_handle = n; return 0; } EXPORT_SYMBOL(avm_pa_dev_pidhandle_register_with_ingress); int avm_pa_dev_pidhandle_register(struct avm_pa_dev_info *devinfo, avm_pid_handle pid_handle, struct avm_pa_pid_cfg *cfg) { return avm_pa_dev_pidhandle_register_with_ingress(devinfo, pid_handle, cfg, 0); } EXPORT_SYMBOL(avm_pa_dev_pidhandle_register); int avm_pa_dev_pid_register_with_ingress(struct avm_pa_dev_info *devinfo, struct avm_pa_pid_cfg *cfg, avm_pid_handle ingress_pid_handle) { return avm_pa_dev_pidhandle_register_with_ingress(devinfo, 0, cfg, ingress_pid_handle); } EXPORT_SYMBOL(avm_pa_dev_pid_register_with_ingress); int avm_pa_dev_pid_register(struct avm_pa_dev_info *devinfo, struct avm_pa_pid_cfg *cfg) { return avm_pa_dev_pidhandle_register_with_ingress(devinfo, 0, cfg, 0); } EXPORT_SYMBOL(avm_pa_dev_pid_register); static void pa_dev_queue_xmit(void *arg, struct sk_buff *skb) { int rc; skb->dev = (struct net_device *)arg; rc = dev_queue_xmit(skb); if (dev_xmit_complete(rc) == false && net_ratelimit()) { pr_err("%s(%s): xmit failure: %d\n", __func__, skb->dev->name, rc); } } int avm_pa_dev_register(struct net_device 
*dev) { struct avm_pa_pid_cfg cfg = {0}; BUG_ON(!dev); if (dev->type == ARPHRD_NONE) cfg.framing = avm_pa_framing_ipdev; else cfg.framing = avm_pa_framing_dev; snprintf(cfg.name, sizeof(cfg.name), "%s", dev->name); cfg.tx_func = pa_dev_queue_xmit; cfg.tx_arg = dev; return avm_pa_dev_pidhandle_register_with_ingress(AVM_PA_DEVINFO(dev), 0, &cfg, 0); } EXPORT_SYMBOL(avm_pa_dev_register); static void pa_dev_start_xmit(void *arg, struct sk_buff *skb) { int rc; struct net_device *dev = arg; struct netdev_queue *txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); bool __maybe_unused again = false; skb->dev = dev; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0) if (!(skb = validate_xmit_skb_list(skb, dev, &again))) { #else if (!(skb = validate_xmit_skb_list(skb, dev))) { #endif if (net_ratelimit()) pr_err("%s(%s): validate_xmit_skb_list() failed\n", __func__, dev->name); return; } #endif HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_tx_queue_stopped(txq)) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) skb = dev_hard_start_xmit(skb, dev, txq, &rc); #else rc = dev_hard_start_xmit(skb, dev, txq); #endif } else { rc = NETDEV_TX_BUSY; } HARD_TX_UNLOCK(dev, txq); if (!dev_xmit_complete(rc)) { if (net_ratelimit()) pr_err("%s(%s): xmit failure: %d\n", __func__, dev->name, rc); kfree_skb_list(skb); } } int avm_pa_dev_register_ll(struct net_device *dev) { struct avm_pa_pid_cfg cfg = {0}; BUG_ON(!dev); /* With NETIF_F_LLTX HARD_TX_LOCK() becomes a no-op instead of an expensive spinlock */ if (!(dev->features & NETIF_F_LLTX)) { pr_warn("%s: Consider NETIF_F_LLTX for device %s for better performance\n", __func__, dev->name); } cfg.framing = avm_pa_framing_ether; snprintf(cfg.name, sizeof(cfg.name), "%s", dev->name); cfg.tx_func = pa_dev_start_xmit; cfg.tx_arg = dev; return avm_pa_dev_pidhandle_register_with_ingress(AVM_PA_DEVINFO(dev), 0, &cfg, 0); } EXPORT_SYMBOL(avm_pa_dev_register_ll); #ifdef CONFIG_AVM_PA_TX_NAPI int avm_pa_dev_pid_register_tx_napi(struct avm_pa_dev_info *devinfo, struct avm_pa_pid_cfg *cfg, struct net_device *dev) { int ret; ret = avm_pa_dev_pidhandle_register_with_ingress(devinfo, 0, cfg, 0); if (!ret) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, devinfo->pid_handle); netif_napi_add(dev, &pid->tx_napi, pa_dev_tx_napi_poll, TX_NAPI_BUDGET); napi_enable(&pid->tx_napi); skb_queue_head_init(&pid->tx_napi_pkts); #ifdef CONFIG_SMP tasklet_init(&pid->tx_napi_tsk, (void *) __do_schedule_napi, (unsigned long) &pid->tx_napi); #endif } return ret; } EXPORT_SYMBOL(avm_pa_dev_pid_register_tx_napi); #endif int avm_pa_pid_set_ecfg(avm_pid_handle pid_handle, struct avm_pa_pid_ecfg *ecfg) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); unsigned int cbsize = sizeof(((struct sk_buff *)0)->cb); if (pid->pid_handle != pid_handle) return -1; memset(&pid->ecfg, 0, sizeof(struct avm_pa_pid_ecfg)); switch (ecfg->version) { case 3: pid->ecfg.pid_group = ecfg->pid_group; case 2: pid->ecfg.rx_slow = ecfg->rx_slow; pid->ecfg.rx_slow_arg = ecfg->rx_slow_arg; case 1: pid->ecfg.cb_start = ecfg->cb_start; pid->ecfg.cb_len = ecfg->cb_len; case 0: pid->ecfg.flags = ecfg->flags; } if (pid->ecfg.cb_start + pid->ecfg.cb_len > cbsize) return -2; pid->ecfg.version = ecfg->version; return 0; } EXPORT_SYMBOL(avm_pa_pid_set_ecfg); int avm_pa_pid_set_framing(avm_pid_handle pid_handle, enum avm_pa_framing ingress_framing, enum avm_pa_framing egress_framing) { struct avm_pa_global 
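/*
 * Summary of avm_pa_pid_set_framing() below: a regular ingress framing is
 * taken over directly (clearing cfg.ptype), while avm_pa_framing_ptype is
 * only accepted if the pid already uses it for ingress and then merely drops
 * tx_func/tx_arg. For egress, avm_pa_framing_dev is stored as
 * avm_pa_framing_ether and avm_pa_framing_ptype is rejected.
 */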
*ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if (pid->pid_handle != pid_handle) return -1; switch (ingress_framing) { case avm_pa_framing_llcsnap: case avm_pa_framing_ether: case avm_pa_framing_ppp: case avm_pa_framing_ip: case avm_pa_framing_ipdev: case avm_pa_framing_dev: pid->ingress_framing = ingress_framing; pid->cfg.ptype = 0; break; case avm_pa_framing_ptype: if (pid->ingress_framing != ingress_framing) return -2; pid->cfg.tx_func = 0; pid->cfg.tx_arg = 0; break; } switch (egress_framing) { case avm_pa_framing_llcsnap: case avm_pa_framing_ether: case avm_pa_framing_ppp: case avm_pa_framing_ip: case avm_pa_framing_ipdev: pid->egress_framing = egress_framing; pid->cfg.ptype = 0; break; case avm_pa_framing_dev: pid->egress_framing = avm_pa_framing_ether; pid->cfg.ptype = 0; break; case avm_pa_framing_ptype: return -3; } return 0; } EXPORT_SYMBOL(avm_pa_pid_set_framing); static void pa_show_pids(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; char buf[128]; avm_pid_handle n; unsigned int i; #define INDENT " " for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = PA_PID(ctx, n); struct avm_pa_pid_ecfg *ecfg = &pid->ecfg; int refcount; if (pid->pid_handle == 0) continue; refcount = kref_read(&pid->ref); if (pid->ingress_pid_handle == pid->pid_handle) { (*fprintffunc)(arg, "PID%-3d: (%5d) %-5s %-5s %10lu %s %s (ref %d)\n", pid->pid_handle, pid->cfg.default_mtu, framing2str(pid->ingress_framing), framing2str(pid->egress_framing), (unsigned long)pid->tx_pkts, pid->cfg.name, pidflags2str(ecfg->flags, buf, sizeof(buf)), refcount); } else { (*fprintffunc)(arg, "PID%-3d: (%5d) %-5s %-5s %10lu %s (ingress %d %s) %s (ref %d)\n", pid->pid_handle, pid->cfg.default_mtu, framing2str(pid->ingress_framing), framing2str(pid->egress_framing), (unsigned long)pid->tx_pkts, pid->cfg.name, pid->ingress_pid_handle, PA_PID(ctx, pid->ingress_pid_handle)->cfg.name, pidflags2str(ecfg->flags, buf, sizeof(buf)), refcount); } if (pid->rx_channel_activated || pid->tx_channel_activated) { (*fprintffunc)(arg, INDENT "rx_channel %d tx_channel %d\n", pid->rx_channel_activated ? 1 : 0, pid->tx_channel_activated ? 
1 : 0); } if (!pid->bridging_ok) (*fprintffunc)(arg, INDENT "bridging disabled\n"); if (ecfg->pid_group) (*fprintffunc)(arg, INDENT "pid_group %d\n", ecfg->pid_group); if (ecfg->rx_slow) (*fprintffunc)(arg, INDENT "rx_slow %pf\n", ecfg->rx_slow); if (ecfg->cb_start || ecfg->cb_len) (*fprintffunc)(arg, INDENT "cb_start %d cb_len %d\n", ecfg->cb_start, ecfg->cb_len); if (avm_pa_pid_tack_enabled(pid) || avm_pa_pid_tget_enabled(pid)) { for (i = 0; i < AVM_PA_MAX_PRIOS; ++i) { unsigned int tackprio = 0; unsigned int tgetprio = 0; if (avm_pa_pid_tack_enabled(pid)) tackprio = avm_pa_pid_tack_prio(pid, i); if (avm_pa_pid_tget_enabled(pid)) tgetprio = avm_pa_pid_tget_prio(pid, i); if (tackprio || tgetprio) { (*fprintffunc)(arg, INDENT "prio[%u]:", i); if (tackprio) (*fprintffunc)(arg, " tack_prio = 0x%x", tackprio); if (tgetprio) (*fprintffunc)(arg, " tget_prio = 0x%x", tgetprio); (*fprintffunc)(arg, "\n"); } } } } #undef INDENT } int avm_pa_dev_vpidhandle_register(struct avm_pa_dev_info *devinfo, avm_vpid_handle vpid_handle, struct avm_pa_vpid_cfg *cfg) { struct avm_pa_global *ctx = &pa_glob; avm_vpid_handle n; if (devinfo->vpid_handle) { if (vpid_handle && devinfo->vpid_handle != vpid_handle) return 0; n = devinfo->vpid_handle; goto slot_found; } if (vpid_handle) { n = vpid_handle; goto slot_found; } for (n=1; n < CONFIG_AVM_PA_MAX_VPID; n++) { if (strncmp(cfg->name, PA_VPID(ctx, n)->cfg.name, AVM_PA_MAX_NAME) == 0) { goto slot_found; } } for (n=1; n < CONFIG_AVM_PA_MAX_VPID; n++) { if (PA_VPID(ctx, n)->vpid_handle == 0) goto slot_found; } return -1; slot_found: if (cfg->v4_mtu == 0) cfg->v4_mtu = 1500; if (cfg->v6_mtu == 0) cfg->v6_mtu = 1500; memset(PA_VPID(ctx, n), 0, sizeof(*PA_VPID(ctx, n))); PA_VPID(ctx, n)->cfg = *cfg; PA_VPID(ctx, n)->vpid_handle = n; devinfo->vpid_handle = n; return 0; } EXPORT_SYMBOL(avm_pa_dev_vpidhandle_register); int avm_pa_dev_vpid_register(struct avm_pa_dev_info *devinfo, struct avm_pa_vpid_cfg *cfg) { return avm_pa_dev_vpidhandle_register(devinfo, 0, cfg); } EXPORT_SYMBOL(avm_pa_dev_vpid_register); int avm_pa_dev_unregister(struct avm_pa_dev_info *devinfo, struct completion *done) { struct avm_pa_global *ctx = &pa_glob; struct avm_hardware_pa *hwpa; if (devinfo->vpid_handle) { struct avm_pa_vpid *my_vpid = PA_VPID(ctx, devinfo->vpid_handle); avm_vpid_handle my_vpid_handle = my_vpid->vpid_handle; devinfo->vpid_handle = 0; if (my_vpid_handle != 0) { my_vpid->vpid_handle = 0; avm_pa_selector_clear_for_vpid(&ctx->show_filter, my_vpid_handle); avm_pa_flush_sessions_for_vpid(my_vpid_handle); } } if (devinfo->pid_handle) { /* * Unregister so that no new sessions can be created for the pid. * * Deleting the pid may be deferred if there are still sessions alive, * this is handled in the GC (through ref counts). * A reregister is possible until all sessions are gone. * * The ref added at avm_pa_dev_pid_register() is still valid therefore * we don't add another ref here but use PA_PID().
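 *
 * Hedged teardown sketch (the remove callback is hypothetical, the avm_pa
 * calls are the ones defined in this file): a driver that registered via
 * avm_pa_dev_register() typically unregisters before the netdev goes away,
 * either asynchronously with a completion or through the _sync wrapper:
 *
 *   static void example_driver_remove(struct net_device *dev)
 *   {
 *      avm_pa_dev_unregister_sync(AVM_PA_DEVINFO(dev)); // may sleep
 *      unregister_netdev(dev);
 *   }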
*/ struct avm_pa_pid *my_pid = PA_PID(ctx, devinfo->pid_handle); avm_pid_handle my_pid_handle = my_pid->pid_handle; devinfo->pid_handle = 0; if (my_pid_handle != 0) { avm_pid_handle n; /* check if pid is used as ingress pid */ for (n = 1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = PA_PID(ctx, n); if (pid->ingress_pid_handle == my_pid_handle) pid->ingress_pid_handle = pid->pid_handle; } avm_pa_selector_clear_for_pid(&ctx->show_filter, my_pid_handle); /* free virtual channels */ hwpa = &ctx->hardware_pa; my_pid->rx_channel_stopped = 1; if (my_pid->tx_channel_activated) { my_pid->tx_channel_activated = 0; if (hwpa->free_tx_channel) hwpa->free_tx_channel(my_pid_handle); } if (my_pid->rx_channel_activated) { my_pid->rx_channel_activated = 0; if (hwpa->free_rx_channel) hwpa->free_rx_channel(my_pid_handle); } /* At session creation, referencing the PID and moving the session * to ACTIVE state is not fully atomic outside the lock (cannot flush sessions * that are in CREATE state). But inside the lock, any sessions in CREATE state * cannot become ACTIVE, because clearing pid->pid_handle prevents new refs. */ spin_lock_bh(&avm_pa_lock); my_pid->pid_handle = 0; my_pid->release_completion = done; if (!pa_pid_put(my_pid_handle)) { avm_pa_flush_sessions_for_pid(my_pid_handle); } spin_unlock_bh(&avm_pa_lock); return 0; } } return -ENODEV; } EXPORT_SYMBOL(avm_pa_dev_unregister); int avm_pa_dev_unregister_sync(struct avm_pa_dev_info *devinfo) { int ret; int my_pid_handle = devinfo->pid_handle; struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, my_pid_handle); DECLARE_COMPLETION_ONSTACK(done); might_sleep(); ret = avm_pa_dev_unregister(devinfo, &done); if (ret != 0) return ret; /* Normally there is no way to block indefinitely, but mark killable * in case of a bug somewhere. */ ret = wait_for_completion_killable_timeout(&done, HZ * 10); if (ret == 0) { /* Timeout. This is fatal. Maybe some session hangs? */ int i; pr_crit("FATAL in %s!\n PID %s is not removed. Refcount: %d.\n Done: %p vs %p", __func__, pid->cfg.name, kref_read(&pid->ref), &done, pid->release_completion /* should be NULL */); for (i = 1; i < CONFIG_AVM_PA_MAX_SESSION; i++) { struct avm_pa_session *s = PA_SESSION(&pa_data, i); void *hw = avm_pa_get_hw_session(s); /* hw != NULL is the most probable reason for session removal to hang */ if (s->on_list != AVM_PA_LIST_FREE && hw) { pa_show_session(s, pa_printk, KERN_CRIT); pr_crit("hw_session : %p\n\n", hw); break; /* print at most one session to not overflow printk buffer */ } } pr_crit("last tick : %ld\n", (long)(jiffies - last_tick)); pr_crit("next tick : %ld\n", (long)(pa_glob.tick_timer.expires - jiffies)); BUG(); } else if (ret < 0) { /* Apparently we can get here during reboot. We continue without completing. * The PID is not fully unregistered yet but we have an additional * safeguard at registration to prevent double registration. */ pr_err("avm_pa: %s: interrupted prematurely\n", __func__); pid->release_completion = NULL; return ret; } else { return 0; /* completed!
*/ } } EXPORT_SYMBOL(avm_pa_dev_unregister_sync); /* ------------------------------------------------------------------------ */ /* -------- pid extra functions ------------------------------------------- */ /* ------------------------------------------------------------------------ */ bool avm_pa_pid_set_bridging(avm_pid_handle pid_handle, bool bridging_ok) { struct avm_pa_pid *pid = pa_pid_get_pid(pid_handle); struct avm_pa_pid *ingress_pid; bool ret; if (!pid) { pr_err("%s: pid %u not registered\n", __func__, pid_handle); return false; } /* Usually the same pid (pid->ingress_pid_handle == pid->pid_handle) */ ingress_pid = pa_pid_get_pid(pid->ingress_pid_handle); if (!ingress_pid) { pr_err("%s: ingress pid %u not registered\n", __func__, pid->ingress_pid_handle); pa_pid_put(pid_handle); return false; } /* Set on the ingress pid as well. We don't want the ingress_pid to create * bridging sessions if we disallow it here. The ingress_pid mechanism is an * optimization (for larger bypass and to allow for HW offloading) and shouldn't * create different types of session. */ ret = pid->bridging_ok; pid->bridging_ok = bridging_ok; ingress_pid->bridging_ok = bridging_ok; pa_pid_put(ingress_pid->pid_handle); pa_pid_put(pid_handle); return ret; } EXPORT_SYMBOL(avm_pa_pid_set_bridging); int avm_pa_pid_set_hwinfo(avm_pid_handle pid_handle, struct avm_pa_pid_hwinfo *hw) { return avm_pa_pid_set_hwinfo2(pid_handle, hw, sizeof(*hw), GFP_ATOMIC); } EXPORT_SYMBOL(avm_pa_pid_set_hwinfo); int avm_pa_pid_set_hwinfo2(avm_pid_handle pid_handle, struct avm_pa_pid_hwinfo *hw, size_t sizeof_hwinfo, gfp_t gfp) { struct avm_pa_pid *pid = pa_pid_get_pid(pid_handle); /* Perhaps the module calling this must be recompiled */ BUG_ON(hw && sizeof_hwinfo < sizeof(struct avm_pa_pid_hwinfo)); BUG_ON(hw == NULL && sizeof_hwinfo != 0); if (!pid) { pr_err("avm_pa_pid_set_hwinfo: pid %u not registered\n", pid_handle); return -ENODEV; } kfree(pid->hw); pid->hw = NULL; if (hw) { pid->hw = kmemdup(hw, sizeof_hwinfo, gfp); if (!pid->hw) { pr_err("avm_pa_pid_set_hwinfo: kmalloc failed\n"); pa_pid_put(pid_handle); return -ENOMEM; } } pa_pid_put(pid_handle); return 0; } EXPORT_SYMBOL(avm_pa_pid_set_hwinfo2); struct avm_pa_pid_hwinfo * avm_pa_pid_get_hwinfo(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); /* no need to ref for read access */ /* Allow to be called with pid_handle == 0, for convenience.
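 *
 * Hedged usage sketch for the hwinfo helpers above (the consumer and the
 * hwinfo contents are hypothetical; struct avm_pa_pid_hwinfo is defined in
 * the avm_pa headers, not in this file):
 *
 *   struct avm_pa_pid_hwinfo info = {0};    // fill in backend specific data
 *   if (avm_pa_pid_set_hwinfo(pid_handle, &info) < 0)
 *      pr_warn("example: could not attach hwinfo\n");
 *
 *   // later, e.g. from a hardware offload backend:
 *   struct avm_pa_pid_hwinfo *hw = avm_pa_pid_get_hwinfo(pid_handle);
 *   if (hw)
 *      example_use_hwinfo(hw);              // hypothetical consumer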
*/ if (pid->pid_handle != pid_handle) { if (net_ratelimit()) pr_err("avm_pa_pid_get_hwinfo: pid %u not registered\n", pid_handle); return NULL; } return pid->hw; } EXPORT_SYMBOL(avm_pa_pid_get_hwinfo); int avm_pa_pid_activate_hw_accelaration(avm_pid_handle pid_handle) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = pa_pid_get_pid(pid_handle); struct avm_hardware_pa *hwpa; if (!pid) { pr_err("avm_pa_pid_activate_hw_accelaration: pid %u not registered\n", pid_handle); return -1; } pr_info("avm_pa: try to activate hw accelaration for pid %u (%s) called from %pf\n", pid_handle, pid->cfg.name, (void *)_RET_IP_); hwpa = &ctx->hardware_pa; if ( pid->rx_channel_activated == 0 && pid->ingress_framing == avm_pa_framing_ether && hwpa->alloc_rx_channel) { if ((*hwpa->alloc_rx_channel)(pid_handle) < 0) { pr_err("avm_pa: can't activate rx channel, pid %u (%s)\n", pid_handle, pid->cfg.name); } else { pid->rx_channel_stopped = 0; pid->rx_channel_activated = 1; pr_info("avm_pa: rx channel activated, pid %u (%s)\n", pid_handle, pid->cfg.name); } } if ( pid->tx_channel_activated == 0 && ( pid->egress_framing == avm_pa_framing_ether || pid->egress_framing == avm_pa_framing_ptype) && hwpa->alloc_tx_channel) { if ((*hwpa->alloc_tx_channel)(pid_handle) < 0) { pr_err("avm_pa: can't activate tx channel, pid %u (%s)\n", pid_handle, pid->cfg.name); } else { pid->tx_channel_activated = 1; pr_info("avm_pa: tx channel activated, pid %u (%s)\n", pid_handle, pid->cfg.name); } } pa_pid_put(pid_handle); return 0; } EXPORT_SYMBOL(avm_pa_pid_activate_hw_accelaration); /* Enables or disables a priority map */ int avm_pa_pid_prio_map_enable(avm_pid_handle pid_handle, unsigned short prio_map, int enable) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if (pid_handle != pid->pid_handle) { pr_err("%s: pid handle %u not registered\n", __FUNCTION__, pid_handle); return -1; } if (prio_map >= AVM_PA_COUNT_PRIO_MAPS) { pr_err("%s: prio map %hu does not exist\n", __FUNCTION__, prio_map); return -2; } pid->prio_maps[prio_map].enabled = enable ? 1 : 0; return 0; } EXPORT_SYMBOL(avm_pa_pid_prio_map_enable); /* Resets a priority map */ int avm_pa_pid_prio_map_reset(avm_pid_handle pid_handle, unsigned short prio_map) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if (pid_handle != pid->pid_handle) { pr_err("%s: pid handle %u not registered\n", __FUNCTION__, pid_handle); return -1; } if (prio_map >= AVM_PA_COUNT_PRIO_MAPS) { pr_err("%s: prio map %hu does not exist\n", __FUNCTION__, prio_map); return -2; } memset(pid->prio_maps[prio_map].prios, 0, sizeof(pid->prio_maps[prio_map].prios)); return 0; } EXPORT_SYMBOL(avm_pa_pid_prio_map_reset); /* Sets the priority per queue */ int avm_pa_pid_prio_map_set_prio_per_queue(avm_pid_handle pid_handle, unsigned short prio_map, unsigned int queue, unsigned int prio) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if (pid_handle != pid->pid_handle) { pr_err("%s: pid handle %u not registered\n", __FUNCTION__, pid_handle); return -1; } if (prio_map >= AVM_PA_COUNT_PRIO_MAPS) { pr_err("%s: prio map %hu does not exist\n", __FUNCTION__, prio_map); return -2; } if (queue >= AVM_PA_MAX_PRIOS) { pr_err("%s: prio map %hu queue %u out of bounds\n", __FUNCTION__, prio_map, queue); return -3; } /* A value of 0 for the prio parameter will render the underlying priority * unspecified. An unspecified priority will not be used for setting any * skb priority. 
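 *
 * Hedged example of configuring the TCP-ACK map of a pid (the priority value
 * 0x6 is made up; the constants are the ones used by
 * avm_pa_pid_activate_tcpackprio() below):
 *
 *   avm_pa_pid_prio_map_enable(pid_handle, AVM_PA_PRIO_MAP_TACK, 1);
 *   avm_pa_pid_prio_map_set_prio_per_queue(pid_handle, AVM_PA_PRIO_MAP_TACK,
 *                                          AVM_PA_BE_QUEUE, 0x6);
 *   // passing prio == 0 instead leaves the priority unspecified again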
*/ pid->prio_maps[prio_map].prios[queue] = prio; return 0; } EXPORT_SYMBOL(avm_pa_pid_prio_map_set_prio_per_queue); int avm_pa_pid_activate_tcpackprio(avm_pid_handle pid_handle, int enable, unsigned int prio) { /* Enable / disable the tack priority map to retain backwards compatibility with the old prioack procfs interface */ if (avm_pa_pid_prio_map_enable(pid_handle, AVM_PA_PRIO_MAP_TACK, enable)) { return -1; } return avm_pa_pid_prio_map_set_prio_per_queue(pid_handle, AVM_PA_PRIO_MAP_TACK, AVM_PA_BE_QUEUE, enable ? prio : 0); } EXPORT_SYMBOL(avm_pa_pid_activate_tcpackprio); int avm_pa_pid_activate_tgetprio(avm_pid_handle pid_handle, int enable, unsigned int prio) { /* Enable / disable the tget priority map to retain backwards compatibility with the old prioack procfs interface */ if (avm_pa_pid_prio_map_enable(pid_handle, AVM_PA_PRIO_MAP_TGET, enable)) { return -1; } return avm_pa_pid_prio_map_set_prio_per_queue(pid_handle, AVM_PA_PRIO_MAP_TGET, AVM_PA_BE_QUEUE, enable ? prio : 0); } EXPORT_SYMBOL(avm_pa_pid_activate_tgetprio); int avm_pa_register_hardware_pa(struct avm_hardware_pa *pa_functions) { struct avm_pa_global *ctx = &pa_glob; if (!pa_functions || (pa_functions->flags & ~AVM_HW_F_ALL)) return -EINVAL; if (!pa_hw_pa_valid(pa_functions)) return -EINVAL; if (pa_hw_pa_valid(&ctx->hardware_pa)) { pr_err("avm_pa: hardware_pa already registered\n"); return -EADDRINUSE; } if (pa_hw_pa_get()) { /* We can only get here if an avm_pa_unregister_hardware_pa() call * didn't complete yet. */ pr_err("avm_pa: deregistration pending\n"); pa_hw_pa_put(); return -EAGAIN; } kref_init(&ctx->hw_pa_ref); ctx->hardware_pa = *pa_functions; if (pa_functions->alloc_tx_channel || pa_functions->alloc_rx_channel) { avm_pid_handle n; for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = PA_PID(ctx, n); if ( pid->pid_handle == n && pid->egress_framing == avm_pa_framing_ptype) { avm_pa_pid_activate_hw_accelaration(n); } } } return 0; } EXPORT_SYMBOL(avm_pa_register_hardware_pa); int avm_pa_unregister_hardware_pa(struct avm_hardware_pa *pa_functions, struct completion *done) { struct avm_pa_global *ctx = &pa_glob; struct avm_hardware_pa *hwpa = &ctx->hardware_pa; int n; if (!pa_functions) return -ENODEV; BUG_ON(hwpa->add_session != pa_functions->add_session); BUG_ON(hwpa->add_session_skb != pa_functions->add_session_skb); /* Stop adding hw sessions. the read side might still have a cached pointer * and add sessions but this is OK since they hold a ref on the hw_pa * and we're not doing the complete(done) here. */ rcu_assign_pointer(hwpa->add_session, NULL); rcu_assign_pointer(hwpa->add_session_skb, NULL); for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = pa_pid_get_pid(n); if (pid) { pid->rx_channel_stopped = 1; if (pid->tx_channel_activated) { pid->tx_channel_activated = 0; if (hwpa->free_tx_channel) hwpa->free_tx_channel(n); } if (pid->rx_channel_activated) { pid->rx_channel_activated = 0; if (hwpa->free_rx_channel) hwpa->free_rx_channel(n); } avm_pa_pid_put(n); } } ctx->hw_pa_flush_completion = done; if (!pa_hw_pa_put()) { /* kill all sessions in hw pa if necessary. pa_hw_pa_put returns 0 * if any session is in_hw (has a reference). 
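 *
 * A backend that must not finish unloading before avm_pa released all of its
 * references can use the synchronous wrapper defined below; hedged sketch
 * (example_hw_pa_ops stands for the struct avm_hardware_pa instance the
 * backend registered earlier):
 *
 *   // hypothetical backend module exit path
 *   avm_pa_unregister_hardware_pa_sync(&example_hw_pa_ops);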
*/ avm_pa_flush_hw_sessions(); } return 0; } EXPORT_SYMBOL(avm_pa_unregister_hardware_pa); int avm_pa_unregister_hardware_pa_sync(struct avm_hardware_pa *pa_functions) { DECLARE_COMPLETION_ONSTACK(done); int ret; struct avm_pa_global *ctx = &pa_glob; might_sleep(); ret = avm_pa_unregister_hardware_pa(pa_functions, &done); /* Normally there is no way to block indefinitely, but mark killable * in case of a bug somewhere. */ if (ret == 0 && wait_for_completion_killable(&done)) { pr_err("avm_pa: %s: interrupted prematurely\n", __func__); /* If we get here we must continue before completing. * The hardware_pa is not fully unregistered yet but there are additional * safe guards at registration to prevent double registration. */ ctx->hw_pa_flush_completion = NULL; } return ret; } EXPORT_SYMBOL(avm_pa_unregister_hardware_pa_sync); int avm_pa_is_hardware_pa_active(void) { struct avm_pa_global *ctx = &pa_glob; return pa_hw_pa_valid(&ctx->hardware_pa) && !ctx->hw_ppa_disabled; } EXPORT_SYMBOL(avm_pa_is_hardware_pa_active); /* ------------------------------------------------------------------------ */ static void pa_show_brief_status_header(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; const char *mode; if (ctx->disabled) mode = "disabled"; else if (ctx->fw_disabled) mode = "testmode"; else if (avm_pa_capture_running()) mode = "capture"; else mode = "enabled"; (*fprintffunc)(arg, "State : %s\n", mode); if (pa_hw_pa_valid(&ctx->hardware_pa)) { mode = ctx->hw_ppa_disabled ? "disabled" : "enable"; (*fprintffunc)(arg, "HW State : %s\n", mode); } #ifdef CONFIG_BLOG else { mode = !blog_fc_enabled() ? "disabled (flow cache)" : "enable (flow cache)"; (*fprintffunc)(arg, "HW State : %s\n", mode); } #endif if ((pa_hw_pa_valid(&ctx->hardware_pa) && (ctx->hardware_pa.flags & AVM_HW_F_NO_BSESSION))) mode = "no (by hw)"; else if (!ctx->bsession_allowed) mode = "no"; else mode = "yes"; (*fprintffunc)(arg, "BSession allow : %s\n", mode); if (ctx->filter_enabled && list_empty(&ctx->accel_filter)) mode = "empty"; else if (ctx->filter_enabled) mode = "yes"; else mode = "no"; (*fprintffunc)(arg, "Filter active : %s\n", mode); } static void pa_show_num_sessions(pa_fprintf fprintffunc, void *arg, int right_align) { struct avm_pa_global *ctx = &pa_glob; (*fprintffunc)(arg, "BSessions : %*u\n", right_align, (unsigned)ctx->stats.nbsessions); (*fprintffunc)(arg, "Sessions : %*hu\n", right_align, ctx->sess_list[AVM_PA_LIST_ACTIVE].nsessions); (*fprintffunc)(arg, "Drop Sessions : %*hu\n", right_align, ctx->stats.drop_sess_ok - ctx->stats.drop_sess_del); /* There is a ref for every in_hw session plus one that's taken on registration */ (*fprintffunc)(arg, "HW Sessions : %*d\n", right_align, kref_read(&ctx->hw_pa_ref) - (ctx->hardware_pa.add_session ? 1 : 0)); (*fprintffunc)(arg, "Max Sessions : %*hu\n", right_align, ctx->sess_list[AVM_PA_LIST_ACTIVE].maxsessions); (*fprintffunc)(arg, "Sessions (dead): %*hu\n", right_align, ctx->sess_list[AVM_PA_LIST_DEAD].nsessions); (*fprintffunc)(arg, "Sessions (free): %*hu\n", right_align, ctx->sess_list[AVM_PA_LIST_FREE].nsessions); } static void pa_show_linux_banner(pa_fprintf fprintffunc, void *arg) { struct new_utsname *uts; uts = utsname(); BUG_ON(!uts); /* cp. 
fs/proc/version.c, v2.6.27..v4.16+: */ (*fprintffunc)(arg, linux_proc_banner, uts->sysname, uts->release, uts->version); } static void pa_show_brief(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; unsigned free_egress = 0; struct hlist_node *node; (*fprintffunc)(arg, "Version : " AVM_PA_VERSION " on "); pa_show_linux_banner(fprintffunc, arg); pa_show_brief_status_header(fprintffunc, arg); pa_show_num_sessions(fprintffunc, arg, 0); hlist_for_each(node, &ctx->egress_freelist) ++free_egress; (*fprintffunc)(arg, "Egress pool : %u/%zd\n", free_egress, ARRAY_SIZE(pa_data.egress_pool)); (*fprintffunc)(arg, "Rx pkts/secs : %lu\n", (unsigned long)ctx->stats.rx_pps); (*fprintffunc)(arg, "Fw pkts/sec : %lu\n", (unsigned long)ctx->stats.fw_pps); (*fprintffunc)(arg, "Ov pkts/sec : %lu\n", (unsigned long)ctx->stats.overlimit_pps); (*fprintffunc)(arg, "Rx pakets : %lu\n", (unsigned long)ctx->stats.rx_pkts); (*fprintffunc)(arg, "Rx bypass : %lu\n", (unsigned long)ctx->stats.rx_bypass); (*fprintffunc)(arg, "Rx ttl <= 1 : %lu\n", (unsigned long)ctx->stats.rx_ttl); (*fprintffunc)(arg, "Rx broadcast : %lu\n", (unsigned long)ctx->stats.rx_broadcast); (*fprintffunc)(arg, "Rx search : %lu\n", (unsigned long)ctx->stats.rx_search); (*fprintffunc)(arg, "Rx match : %lu\n", (unsigned long)ctx->stats.rx_match); (*fprintffunc)(arg, "Rx modified : %lu\n", (unsigned long)ctx->stats.rx_mod); (*fprintffunc)(arg, "Fw pakets : %lu\n", (unsigned long)ctx->stats.fw_pkts); (*fprintffunc)(arg, "Fw local : %lu\n", (unsigned long)ctx->stats.fw_local); } static void pa_show_memory(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; pa_show_brief_status_header(fprintffunc, arg); #define FMT "%4zd.%02zd KB" #define ARG(x) (x)/1000, ((x)%1000)/10 (*fprintffunc)(arg, "avm_pa_global : " FMT "\n", ARG(sizeof(struct avm_pa_global))); (*fprintffunc)(arg, "avm_pa_data : " FMT "\n", ARG(sizeof(struct avm_pa_data))); (*fprintffunc)(arg, "global + data : " FMT "\n", ARG(sizeof(struct avm_pa_global) + sizeof(struct avm_pa_data))); (*fprintffunc)(arg, "One session : " FMT "\n", ARG(sizeof(struct avm_pa_session))); (*fprintffunc)(arg, "All sessions : " FMT "\n", ARG(sizeof(pd->sessions))); (*fprintffunc)(arg, "One bsession : " FMT "\n", ARG(sizeof(struct avm_pa_bsession))); (*fprintffunc)(arg, "All bsessions : " FMT "\n", ARG(sizeof(ctx->bsess_array))); (*fprintffunc)(arg, "One ingress : " FMT "\n", ARG(sizeof(struct avm_pa_pkt_match))); (*fprintffunc)(arg, "One egress : " FMT "\n", ARG(sizeof(struct avm_pa_egress))); (*fprintffunc)(arg, "Egress pool : " FMT "\n", ARG(sizeof(pd->egress_pool))); (*fprintffunc)(arg, "One macaddr : " FMT "\n", ARG(sizeof(struct avm_pa_macaddr))); (*fprintffunc)(arg, "All macaddrs : " FMT "\n", ARG(sizeof(ctx->macaddr_array))); (*fprintffunc)(arg, "One pid : " FMT "\n", ARG(sizeof(struct avm_pa_pid))); (*fprintffunc)(arg, "All pids : " FMT "\n", ARG(sizeof(ctx->pid_array))); (*fprintffunc)(arg, "One vpid : " FMT "\n", ARG(sizeof(struct avm_pa_vpid))); (*fprintffunc)(arg, "All vpids : " FMT "\n", ARG(sizeof(ctx->vpid_array))); (*fprintffunc)(arg, "Stats : " FMT "\n", ARG(sizeof(struct avm_pa_stats))); (*fprintffunc)(arg, "TOK Stats : " FMT "\n", ARG(ctx->tok_end - ctx->tok_start)); (*fprintffunc)(arg, "Estimator data : " FMT "\n", ARG(ctx->est_end - ctx->est_start)); } static void pa_show_stats(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; pa_show_num_sessions(fprintffunc, arg, 9); #define PRINT_STAT(t, 
member) do { \ (*fprintffunc)(arg, "%-15s: %9lu (%+7ld)\n", t, \ (unsigned long)ctx->stats. member, \ (long)(ctx->stats. member - ctx->stats_copy. member)) ; \ } while(0) PRINT_STAT("Rx packets/sec", rx_pps); PRINT_STAT("Fw packets/sec", fw_pps); PRINT_STAT("Ov packets/sec", overlimit_pps); PRINT_STAT("Rxfw packets", rxfw_pkts); PRINT_STAT("Rxfw bypass", rxfw_bypass); PRINT_STAT("Rx pakets", rx_pkts); PRINT_STAT("Rx bypass", rx_bypass); PRINT_STAT("Rx frag list", rx_frag_list); PRINT_STAT("Rx ttl <= 1", rx_ttl); PRINT_STAT("Rx broadcast", rx_broadcast); PRINT_STAT("Rx search", rx_search); PRINT_STAT("Rx match", rx_match); PRINT_STAT("Rx lisp changed", rx_lispchanged); PRINT_STAT("Rx df", rx_df); PRINT_STAT("Rx modified", rx_mod); PRINT_STAT("Rx overlimit", rx_overlimit); PRINT_STAT("Rx dropped", rx_dropped); PRINT_STAT("Rx irq", rx_irq); PRINT_STAT("Rx irq dropped", rx_irqdropped); PRINT_STAT("Rx size", rx_too_small); PRINT_STAT("Rx hroom", rx_headroom_too_small); PRINT_STAT("Fw pakets", fw_pkts); PRINT_STAT("Fw output", fw_output); PRINT_STAT("Fw output drop", fw_output_drop); PRINT_STAT("Fw local", fw_local); PRINT_STAT("Fw rtp", fw_rtp); PRINT_STAT("Fw rtp drop", fw_rtp_drop); PRINT_STAT("Fw illegal", fw_ill); PRINT_STAT("Fw frags", fw_frags); PRINT_STAT("Fw drop", fw_drop); PRINT_STAT("Fw drop gone", fw_drop_gone); PRINT_STAT("Fw fail", fw_fail); PRINT_STAT("Fw frag fail", fw_frag_fail); PRINT_STAT("Tx local", tx_local); PRINT_STAT("Tx loopback", tx_loopback); PRINT_STAT("Tx already", tx_already); PRINT_STAT("Tx bypass", tx_bypass); PRINT_STAT("Tx sess error", tx_sess_error); PRINT_STAT("Tx sess ok", tx_sess_ok); PRINT_STAT("Tx sess exists", tx_sess_exists); PRINT_STAT("Tx egress error", tx_egress_error); PRINT_STAT("Tx egress ok", tx_egress_ok); PRINT_STAT("Tx pid change", tx_pid_change); PRINT_STAT("Loc sess error", local_sess_error); PRINT_STAT("Loc sess ok", local_sess_ok); PRINT_STAT("Loc sess exists", local_sess_exists); PRINT_STAT("DROP sess ok", drop_sess_ok); PRINT_STAT("XFRM sess ok", xfrm_sess_ok); PRINT_STAT("RTP sess error", rtp_sess_error); PRINT_STAT("RTP sess ok", rtp_sess_ok); PRINT_STAT("RTP sess exists", rtp_sess_exists); PRINT_STAT("TBF schedule", tbf_schedule); PRINT_STAT("TBF reschedule", tbf_reschedule); #ifdef CONFIG_AVM_PA_RPS { int i; for (i = 0; i < CONFIG_AVM_PA_RPS_QUEUES; i++) { (*fprintffunc)(arg, "RPS enqueue %2d : %9lu\n", i, ctx->rps[i].rx_enqueued); (*fprintffunc)(arg, "RPS ipis %2d : %9lu\n", i, ctx->rps[i].rx_rps_ipis); (*fprintffunc)(arg, "RPS dequeue %2d : %9lu\n", i, ctx->rps[i].rx_dequeued); (*fprintffunc)(arg, "RPS rv size %2d : %9zu\n", i, ctx->rps[i].r_sz * sizeof(struct sk_buff *)); } } #endif PRINT_STAT("sess flushed", sess_flushed); PRINT_STAT("sess timedout", sess_timedout); PRINT_STAT("sess pid change", sess_pidchanged); PRINT_STAT("rxch no rx slow", rx_channel_no_rx_slow); PRINT_STAT("rxch stopped", rx_channel_stopped); PRINT_STAT("txch dropped", tx_channel_dropped); ctx->stats_copy = ctx->stats; }; static void pa_show_status(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; pa_show_brief_status_header(fprintffunc, arg); (*fprintffunc)(arg, "Current Rate : %lu\n", (unsigned long)ctx->stats.fw_pps); } static void pa_show_vpids(pa_fprintf fprintffunc, void *arg) { struct avm_pa_global *ctx = &pa_glob; avm_vpid_handle n; for (n=1; n < CONFIG_AVM_PA_MAX_VPID; n++) { struct avm_pa_vpid *vpid = PA_VPID(ctx, n); if (vpid->vpid_handle == 0) continue; (*fprintffunc)(arg, "VPID%-2d: %4d/%4d %s\n", vpid->vpid_handle, 
vpid->cfg.v4_mtu, vpid->cfg.v6_mtu, vpid->cfg.name); } } void avm_pa_dev_set_ipv4_mtu(struct avm_pa_dev_info *devinfo, u16 mtu) { if (devinfo->vpid_handle) { struct avm_pa_global *ctx = &pa_glob; PA_VPID(ctx, devinfo->vpid_handle)->cfg.v4_mtu = mtu; } } EXPORT_SYMBOL(avm_pa_dev_set_ipv4_mtu); void avm_pa_dev_set_ipv6_mtu(struct avm_pa_dev_info *devinfo, u16 mtu) { if (devinfo->vpid_handle) { struct avm_pa_global *ctx = &pa_glob; PA_VPID(ctx, devinfo->vpid_handle)->cfg.v6_mtu = mtu; } } EXPORT_SYMBOL(avm_pa_dev_set_ipv6_mtu); static void pa_flush_sessions_selective(bool (*match_session)(struct avm_pa_session *sess, va_list args), const char *reason, ...) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_session *session; struct avm_pa_session_list *list = &ctx->sess_list[AVM_PA_LIST_ACTIVE]; unsigned count = 0; va_list ap; /* There is a linker error on mips if the nested functions access stack variables * of the outer functions. So we pass them va variadic args. Change to static * functions if this also breaks down */ va_start(ap, reason); /* We are potentially called from process context. Make sure this is called * rarely in softirq, try to use pa_session_flush() directly. */ rcu_read_lock_bh(); list_for_each_entry_rcu(session, &list->sessions, session_list) { if (match_session(session, ap)) { pa_session_flush(session, reason); count += 1; } } rcu_read_unlock_bh(); va_end(ap); ctx->stats.sess_flushed += count; } void avm_pa_flush_sessions(void) { bool fn(struct avm_pa_session *session, va_list args) { return true; } pa_flush_sessions_selective(fn, "flush"); } EXPORT_SYMBOL(avm_pa_flush_sessions); void avm_pa_flush_sessions_select(const char *selector, gfp_t gfp) { LIST_HEAD(selector_list); int ret; bool fn(struct avm_pa_session *session, va_list args) { struct list_head *selector_list = va_arg(args, struct list_head *); return avm_pa_session_is_selected(selector_list, session); } ret = avm_pa_parse_selector(&selector_list, selector, gfp); if (WARN_ON(ret != 0)) return; pa_flush_sessions_selective(fn, "select", &selector_list); avm_pa_selector_free(&selector_list); } EXPORT_SYMBOL(avm_pa_flush_sessions_select); static void avm_pa_flush_bsessions(void) { bool fn(struct avm_pa_session *session, va_list args) { return session->bsession != NULL; } pa_flush_sessions_selective(fn, "bsession flush"); } static void avm_pa_flush_hw_sessions(void) { bool fn(struct avm_pa_session *session, va_list args) { return test_bit(PA_S_IN_HW, &session->flags); } pa_flush_sessions_selective(fn, "hw flush"); } void avm_pa_flush_lispencap_sessions(void) { bool fn(struct avm_pa_session *session, va_list args) { return session->mod.pkttype & AVM_PA_PKTTYPE_LISP; } pa_flush_sessions_selective(fn, "lispencap flush"); } EXPORT_SYMBOL(avm_pa_flush_lispencap_sessions); void avm_pa_flush_rtp_session(struct sock *sk) { bool fn(struct avm_pa_session *session, va_list args) { struct avm_pa_egress *egress = avm_pa_first_egress(session); return egress->type == avm_pa_egresstype_rtp && egress->rtp.sk == va_arg(args, struct sock *); } pa_flush_sessions_selective(fn, "rtp flush", sk); } EXPORT_SYMBOL(avm_pa_flush_rtp_session); void avm_pa_flush_multicast_sessions(void) { bool fn(struct avm_pa_session *session, va_list args) { return session->ingress.casttype == AVM_PA_IS_MULTICAST; } pa_flush_sessions_selective(fn, "multicast flush"); } EXPORT_SYMBOL(avm_pa_flush_multicast_sessions); void avm_pa_flush_multicast_sessions_for_group(u32 group) { bool fn(struct avm_pa_session *session, va_list args) { if 
(session->ingress.casttype == AVM_PA_IS_MULTICAST) { int i; for (i = 0; i < session->ingress.nmatch; i++) { struct avm_pa_match_info *p = &session->ingress.match[i]; if (p->type == AVM_PA_IPV4) { hdrunion_t *hdr = (hdrunion_t *)&session->ingress.hdrcopy[p->offset + session->ingress.hdroff]; if (va_arg(args, u32) == hdr->iph.daddr) return true; } } } return false; } pa_flush_sessions_selective(fn, "multicast flush", group); } EXPORT_SYMBOL(avm_pa_flush_multicast_sessions_for_group); void avm_pa_flush_sessions_for_vpid(avm_vpid_handle vpid_handle) { bool fn(struct avm_pa_session *session, va_list args) { avm_vpid_handle vpid_handle = (avm_vpid_handle) va_arg(args, int); if (session->ingress_vpid_handle == vpid_handle) { return true; } else { struct avm_pa_egress *egress; avm_pa_for_each_egress(egress, session) { if (egress->vpid_handle == vpid_handle) return true; } return false; } } pa_flush_sessions_selective(fn, "vpid flush", (int) vpid_handle); } EXPORT_SYMBOL(avm_pa_flush_sessions_for_vpid); void avm_pa_flush_sessions_for_pid(avm_pid_handle pid_handle) { bool fn(struct avm_pa_session *session, va_list args) { avm_vpid_handle pid_handle = (avm_pid_handle) va_arg(args, int); if (session->ingress_pid_handle == pid_handle) { return true; } else { struct avm_pa_egress *egress; avm_pa_for_each_egress(egress, session) { if (egress->pid_handle == pid_handle) return true; } return false; } } pa_flush_sessions_selective(fn, "pid flush", (int) pid_handle); } EXPORT_SYMBOL(avm_pa_flush_sessions_for_pid); void avm_pa_flush_sessions_for_sg(unsigned short groupid) { bool fn(struct avm_pa_session *session, va_list args) { unsigned short groupid = (unsigned short) va_arg(args, int); return avm_pa_session_belongs_to_sg(session, groupid) != 0; } pa_flush_sessions_selective(fn, "group flush", (int) groupid); } EXPORT_SYMBOL(avm_pa_flush_sessions_for_sg); static void avm_pa_flush_sessions_with_destmac(struct avm_pa_macaddr *destmac) { bool fn(struct avm_pa_session *session, va_list args) { struct avm_pa_egress *egress; avm_pa_for_each_egress(egress, session) { if (egress->destmac == va_arg(args, struct avm_pa_macaddr *)) { return true; } } return false; } pa_flush_sessions_selective(fn, "destmac", destmac); } void avm_pa_flush_sessions_for_mac(const unsigned char mac[ETH_ALEN]) { bool fn(struct avm_pa_session *session, va_list args) { struct avm_pa_egress *egress; struct ethhdr *ethh; u8 *mac = (u8 *) va_arg(args, u8 *); struct avm_pa_pkt_match *match = &session->ingress; struct avm_pa_match_info *info = &match->match[0]; if (info->type == AVM_PA_ETH) { /* We want to flush all sessions that belong to the mac address. * That means we check both source and dest of the ingress. 
* * We also have to check each egress: * 1) In typical NAT scenarios both source and dest change * 2) In typical MAT scenarios the source changes */ ethh = (struct ethhdr *) (HDRCOPY(match) + info->offset); if (ether_addr_equal(mac, ethh->h_source)) return true; if (ether_addr_equal(mac, ethh->h_dest)) return true; } if (session->bsession) /* egress doesn't change for bsessions */ return false; avm_pa_for_each_egress(egress, session) { match = &egress->match; info = &match->match[0]; if (egress->destmac && info->type == AVM_PA_ETH) { ethh = (struct ethhdr *) (HDRCOPY(match) + info->offset); if (ether_addr_equal(mac, ethh->h_source)) return true; if (ether_addr_equal(mac, ethh->h_dest)) return true; } } return false; } /* enforce alignment required by ether_addr_equal() */ u8 mac_tmp[ETH_ALEN] __aligned(2); memcpy(mac_tmp, mac, ETH_ALEN); pa_flush_sessions_selective(fn, "mac flush", &mac_tmp); } EXPORT_SYMBOL(avm_pa_flush_sessions_for_mac); void avm_pa_telefon_state(int state) { pr_info("avm_pa: avm_pa_telefon_state\n"); } EXPORT_SYMBOL(avm_pa_telefon_state); /* ------------------------------------------------------------------------ */ /* ------- packet rate estimater ------------------------------------------ */ /* ------------------------------------------------------------------------ */ #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) static void avm_pa_est_timer(unsigned long data) #else static void avm_pa_est_timer(struct timer_list *timer) #endif { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_est *e; u32 npackets; u32 rate; /* fw pkts/s */ e = &ctx->fw_est; npackets = ctx->stats.fw_pkts; if (npackets >= e->last_packets) { rate = (npackets - e->last_packets)<<(12 - ctx->est_idx); e->last_packets = npackets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); ctx->stats.fw_pps = (e->avpps+0x1FF)>>10; } else { e->last_packets = npackets; } /* rx pkts/s */ e = &ctx->rx_est; npackets = ctx->stats.rx_pkts; if (npackets >= e->last_packets) { rate = (npackets - e->last_packets)<<(12 - ctx->est_idx); e->last_packets = npackets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); ctx->stats.rx_pps = (e->avpps+0x1FF)>>10; } else { e->last_packets = npackets; } /* queued pkts/s */ e = &ctx->overlimit_est; npackets = ctx->stats.rx_overlimit; if (npackets >= e->last_packets) { rate = (npackets - e->last_packets)<<(12 - ctx->est_idx); e->last_packets = npackets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); ctx->stats.overlimit_pps = (e->avpps+0x1FF)>>10; } else { e->last_packets = npackets; } mod_timer(&ctx->est_timer, jiffies + ((HZ/4) << ctx->est_idx)); } static void avm_pa_setup_est(void) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_est *e; del_timer(&ctx->est_timer); e = &ctx->fw_est; e->ewma_log = ctx->ewma_log; e->last_packets = ctx->stats.fw_pkts; e = &ctx->rx_est; e->ewma_log = ctx->ewma_log; e->last_packets = ctx->stats.rx_pkts; e = &ctx->overlimit_est; e->ewma_log = ctx->ewma_log; e->last_packets = ctx->stats.rx_overlimit; mod_timer(&ctx->est_timer, jiffies + ((HZ/4) << ctx->est_idx)); } static void avm_pa_unsetup_est(void) { struct avm_pa_global *ctx = &pa_glob; del_timer(&ctx->est_timer); } /* ------------------------------------------------------------------------ */ /* -------- value log ----------------------------------------------------- */ /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ static void 
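/*
 * Informal note on the estimator arithmetic above (derived from the code,
 * not from separate documentation): the timer fires every (HZ/4) << est_idx
 * jiffies, i.e. 0.25 * 2^est_idx seconds. The counter delta of one interval
 * is converted to packets/s in 1/1024 units by
 *
 *   rate = delta << (12 - est_idx);   // == delta * 1024 * 4 / 2^est_idx
 *
 * and folded into an exponentially weighted moving average with weight
 * 2^-ewma_log:
 *
 *   avpps += (rate >> ewma_log) - (avpps >> ewma_log);
 *   pps = (avpps + 0x1FF) >> 10;      // back to whole packets/s
 *
 * Worked example with est_idx == 0 (250 ms interval): 250 packets per
 * interval give rate = 250 << 12 = 1024000, i.e. 1000 packets/s.
 */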
avm_pa_enable(void) { avm_pa_setup_est(); } static void avm_pa_disable(void) { avm_pa_unsetup_est(); } #ifdef CONFIG_PROC_FS /* ------------------------------------------------------------------------ */ /* -------- procfs functions ---------------------------------------------- */ /* ------------------------------------------------------------------------ */ static int brief_show(struct seq_file *m, void *v) { pa_show_brief((pa_fprintf *)seq_printf, m); return 0; } static int brief_show_open(struct inode *inode, struct file *file) { return single_open(file, brief_show, PDE_DATA(inode)); } static const struct proc_ops brief_show_ops = { .proc_open = brief_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* brief_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static int memory_show(struct seq_file *m, void *v) { pa_show_memory((pa_fprintf *)seq_printf, m); return 0; } static int memory_show_open(struct inode *inode, struct file *file) { return single_open(file, memory_show, PDE_DATA(inode)); } static const struct proc_ops memory_show_ops = { .proc_open = memory_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* memory_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static int status_show(struct seq_file *m, void *v) { pa_show_status((pa_fprintf *)seq_printf, m); return 0; } static int status_show_open(struct inode *inode, struct file *file) { return single_open(file, status_show, PDE_DATA(inode)); } static const struct proc_ops status_show_ops = { .proc_open = status_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* status_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static int stats_show(struct seq_file *m, void *v) { pa_show_stats((pa_fprintf *)seq_printf, m); return 0; } static int stats_show_open(struct inode *inode, struct file *file) { return single_open(file, stats_show, PDE_DATA(inode)); } static const struct proc_ops stats_show_ops = { .proc_open = stats_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* stats_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static int pids_show(struct seq_file *m, void *v) { pa_show_pids((pa_fprintf *)seq_printf, m); return 0; } static int pids_show_open(struct inode *inode, struct file *file) { return single_open(file, pids_show, PDE_DATA(inode)); } static const struct proc_ops pids_show_ops = { .proc_open = pids_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* pids_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static int vpids_show(struct seq_file *m, void *v) { pa_show_vpids((pa_fprintf *)seq_printf, m); return 0; } static int vpids_show_open(struct inode *inode, struct file *file) { return single_open(file, vpids_show, PDE_DATA(inode)); } static const struct proc_ops vpids_show_ops = { .proc_open = vpids_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* vpids_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ struct handle_iter { unsigned short handle; }; static inline unsigned short 
next_session(struct avm_pa_global *ctx, unsigned short handle) { while (++handle < CONFIG_AVM_PA_MAX_SESSION) { struct avm_pa_session *sess = pa_session_get(handle); if (sess && avm_pa_session_is_selected(&ctx->show_filter, sess)) return handle; } return 0; } static void *sess_show_seq_start(struct seq_file *seq, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; struct handle_iter *it = seq->private; loff_t i; if ((it->handle = next_session(ctx, 0)) == 0) return 0; for (i = 0; i < *pos; i++) { if ((it->handle = next_session(ctx, it->handle)) == 0) return 0; } return PA_SESSION(pd, it->handle); } static void *sess_show_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct avm_pa_data *pd = &pa_data; struct handle_iter *it = seq->private; ++*pos; if ((it->handle = next_session(ctx, it->handle)) == 0) return 0; return PA_SESSION(pd, it->handle); } static void sess_show_seq_stop(struct seq_file *seq, void *v) { } static int sess_show_seq_show(struct seq_file *seq, void *v) { struct avm_pa_data *pd = &pa_data; const struct handle_iter *it = seq->private; seq_printf(seq, "\n"); pa_show_session(PA_SESSION(pd, it->handle), (pa_fprintf *)seq_printf, seq); return 0; } static struct seq_operations sess_show_seq_ops = { .start = sess_show_seq_start, .next = sess_show_seq_next, .stop = sess_show_seq_stop, .show = sess_show_seq_show, }; static int sess_show_open(struct inode *inode, struct file *file) { return seq_open_private(file, &sess_show_seq_ops, sizeof(struct handle_iter)); } static const struct proc_ops sess_show_ops = { .proc_open = sess_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, /* sess_show_open() uses seq_open_private() */ }; /* ------------------------------------------------------------------------ */ static inline unsigned short next_bsession(struct avm_pa_global *ctx, unsigned short handle) { while (++handle < CONFIG_AVM_PA_MAX_SESSION) { struct avm_pa_session *session; if ((session = pa_session_get(handle)) != 0 && session->bsession) return handle; } return 0; } static void *bsess_show_seq_start(struct seq_file *seq, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; loff_t i; if ((it->handle = next_bsession(ctx, 0)) == 0) return 0; for (i = 0; i < *pos; i++) { if ((it->handle = next_bsession(ctx, it->handle)) == 0) return 0; } return PA_BSESSION(ctx, it->handle); } static void *bsess_show_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; ++*pos; if ((it->handle = next_bsession(ctx, it->handle)) == 0) return 0; return PA_BSESSION(ctx, it->handle); } static void bsess_show_seq_stop(struct seq_file *seq, void *v) { } static int bsess_show_seq_show(struct seq_file *seq, void *v) { struct avm_pa_global *ctx = &pa_glob; const struct handle_iter *it = seq->private; seq_printf(seq, "\n"); pa_show_bsession(PA_BSESSION(ctx, it->handle), (pa_fprintf *)seq_printf, seq); return 0; } static struct seq_operations bsess_show_seq_ops = { .start = bsess_show_seq_start, .next = bsess_show_seq_next, .stop = bsess_show_seq_stop, .show = bsess_show_seq_show, }; static int bsess_show_open(struct inode *inode, struct file *file) { return seq_open_private(file, &bsess_show_seq_ops, sizeof(struct handle_iter)); } static const struct proc_ops bsess_show_ops = { .proc_open = bsess_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = 
seq_release_private, /* bsess_show_open() uses seq_open_private() */ }; /* ------------------------------------------------------------------------ */ static inline int next_macaddrhash(struct avm_pa_global *ctx, int idx) { while (++idx < ARRAY_SIZE(ctx->macaddr_hashtab)) { if (!hlist_empty(&ctx->macaddr_hashtab[idx])) return idx; } return 0; } static void *macaddr_show_seq_start(struct seq_file *seq, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; loff_t i; if ((it->handle = next_macaddrhash(ctx, -1)) == 0) return 0; for (i = 0; i < *pos; i++) { if ((it->handle = next_macaddrhash(ctx, it->handle)) == 0) return 0; } return &ctx->macaddr_hashtab[it->handle]; } static void *macaddr_show_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; ++*pos; if ((it->handle = next_macaddrhash(ctx, it->handle)) == 0) return 0; return &ctx->macaddr_hashtab[it->handle]; } static void macaddr_show_seq_stop(struct seq_file *seq, void *v) { } static int macaddr_show_seq_show(struct seq_file *seq, void *v) { struct avm_pa_global *ctx = &pa_glob; const struct handle_iter *it = seq->private; struct avm_pa_macaddr *p; char buf[128]; rcu_read_lock_bh(); seq_printf(seq, "%04x:", it->handle); hlist_for_each_entry_rcu_bh(p, &ctx->macaddr_hashtab[it->handle], macaddr_list) { (void) pa_macaddr2str(p, buf, sizeof(buf)); seq_printf(seq, "\t%s\n", buf); } rcu_read_unlock_bh(); return 0; } static struct seq_operations macaddr_show_seq_ops = { .start = macaddr_show_seq_start, .next = macaddr_show_seq_next, .stop = macaddr_show_seq_stop, .show = macaddr_show_seq_show, }; static int macaddr_show_open(struct inode *inode, struct file *file) { return seq_open_private(file, &macaddr_show_seq_ops, sizeof(struct handle_iter)); } static const struct proc_ops macaddr_show_ops = { .proc_open = macaddr_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, /* macaddr_show_open() uses seq_open_private() */ }; /* ------------------------------------------------------------------------ */ static inline unsigned short next_pid(struct avm_pa_global *ctx, unsigned short handle) { while (++handle < CONFIG_AVM_PA_MAX_PID) { if (PA_PID(ctx, handle)->pid_handle) return handle; } return 0; } static void *pid_show_seq_start(struct seq_file *seq, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; loff_t i; if ((it->handle = next_pid(ctx, 0)) == 0) return 0; for (i = 0; i < *pos; i++) { if ((it->handle = next_pid(ctx, it->handle)) == 0) return 0; } return PA_PID(ctx, it->handle); } static void *pid_show_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct avm_pa_global *ctx = &pa_glob; struct handle_iter *it = seq->private; ++*pos; if ((it->handle = next_pid(ctx, it->handle)) == 0) return 0; return PA_PID(ctx, it->handle); } static void pid_show_seq_stop(struct seq_file *seq, void *v) { } static int hash_show_seq_show(struct seq_file *seq, void *v) { struct avm_pa_global *ctx = &pa_glob; const struct handle_iter *it = seq->private; struct avm_pa_pid *pid = PA_PID(ctx, it->handle); struct avm_pa_session *p; int i; seq_printf(seq, "PID%-3d: %s\n", it->handle, PA_PID(ctx, it->handle)->cfg.name); rcu_read_lock_bh(); for (i = 0; i < ARRAY_SIZE(pid->hash_sess); i++) { if (!hlist_empty(&pid->hash_sess[i])) { seq_printf(seq, "%3d: ", i); hlist_for_each_entry_rcu_bh(p, &pid->hash_sess[i], hash_list) seq_printf(seq, " %3d", p->session_handle); 
seq_printf(seq, "\n"); } } rcu_read_unlock_bh(); return 0; } static struct seq_operations hash_show_seq_ops = { .start = pid_show_seq_start, .next = pid_show_seq_next, .stop = pid_show_seq_stop, .show = hash_show_seq_show, }; static int hash_show_open(struct inode *inode, struct file *file) { return seq_open_private(file, &hash_show_seq_ops, sizeof(struct handle_iter)); } static const struct proc_ops hash_show_ops = { .proc_open = hash_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, /* hash_show_open() uses seq_open_private() */ }; /* ------------------------------------------------------------------------ */ static int prioack_show(struct seq_file *seq, void *v) { struct avm_pa_global *ctx = &pa_glob; int i, j; seq_printf(seq, "Packet Threshold : %u\n", ctx->prioack_thresh_packets); seq_printf(seq, "Ratio : %u\n", ctx->prioack_ratio); for (i = 1; i < CONFIG_AVM_PA_MAX_PID; ++i) { struct avm_pa_pid *pid = PA_PID(ctx, i); if (avm_pa_pid_tack_enabled(pid)) { seq_printf(seq, "PID%d: Detected ACKs : %u\n", pid->pid_handle, pid->prioack_acks); seq_printf(seq, "PID%d: Accelerated ACK : %u\n", pid->pid_handle, pid->prioack_accl_acks); for (j = 0; j < AVM_PA_MAX_PRIOS; ++j) { seq_printf(seq, "PID%d: TACK Priority[%d]: %x\n", pid->pid_handle, j, pid->prio_maps[AVM_PA_PRIO_MAP_TACK].prios[j]); } } if (avm_pa_pid_tget_enabled(pid)) { for (j = 0; j < AVM_PA_MAX_PRIOS; ++j) { seq_printf(seq, "PID%d: TGET Priority[%d]: %x\n", pid->pid_handle, j, pid->prio_maps[AVM_PA_PRIO_MAP_TGET].prios[j]); } } } return 0; } static int prioack_show_open(struct inode *inode, struct file *file) { return single_open(file, prioack_show, PDE_DATA(inode)); } static const struct proc_ops prioack_show_ops = { .proc_open = prioack_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* prioack_show_open() uses single_open() */ }; static int priomaps_show(struct seq_file *seq, void *v) { struct avm_pa_global *ctx = &pa_glob; int i, j, k; for (i = 1; i < CONFIG_AVM_PA_MAX_PID; ++i) { struct avm_pa_pid *pid = PA_PID(ctx, i); if (pid->pid_handle == 0) { continue; } seq_printf(seq, "PID %d Prio Maps\n", pid->pid_handle); for (j = 0; j < AVM_PA_COUNT_PRIO_MAPS; ++j) { if (!pid->prio_maps[j].enabled) { continue; } seq_printf(seq, "Prio Map[%d]\n", j); for (k = 0; k < AVM_PA_MAX_PRIOS; ++k) { seq_printf(seq, "Queue[%d]: %x\n", k, pid->prio_maps[j].prios[k]); } } } return 0; } static int priomaps_show_open(struct inode *inode, struct file *file) { return single_open(file, priomaps_show, PDE_DATA(inode)); } static const struct proc_ops priomaps_show_ops = { .proc_open = priomaps_show_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, /* priomaps_show_open() uses single_open() */ }; /* ------------------------------------------------------------------------ */ static avm_pid_handle pa_find_pid_by_name(const char *pidname) { struct avm_pa_global *ctx = &pa_glob; avm_pid_handle pid_handle; for (pid_handle = 1; pid_handle < CONFIG_AVM_PA_MAX_PID; pid_handle++) { struct avm_pa_pid *pid = PA_PID(ctx, pid_handle); if ( pid->pid_handle == pid_handle && strcmp(pid->cfg.name, pidname) == 0) { return pid_handle; } } return 0; } /* Normal strsep returns empty strings for duplicated delimtiers */ static char *strsep_nonempty(char **stringp, const char *delim) { char *p = strsep(stringp, delim); while (p && *p == 0) p = strsep(stringp, delim); return p; } static ssize_t avm_pa_write_cmds(struct file *file, const char __user 
static ssize_t avm_pa_write_cmds(struct file *file, const char __user *buffer,
                                 size_t count, loff_t *offset)
{
   struct avm_pa_global *ctx = &pa_glob;
   char pp_cmd[101];
   char *argv[10];
   int argc;
   char *ptr_next_tok;
   char *ptr_next_line;
   avm_pid_handle pid_handle;

   /* Validate the length of data passed. */
   if (count >= sizeof(pp_cmd))
      return -E2BIG;

   /* Initialize the buffer before using it. */
   memset(pp_cmd, 0, sizeof(pp_cmd));

   /* Copy from user space. */
   if (copy_from_user(pp_cmd, buffer, count))
      return -EFAULT;

   ptr_next_line = pp_cmd;
   /* one command (with arguments) per line */
   while ((ptr_next_tok = strsep_nonempty(&ptr_next_line, "\n"))) {
      /* extract arguments */
      for (argc = 0; argc < ARRAY_SIZE(argv); argc++)
         argv[argc] = strsep_nonempty(&ptr_next_tok, " \t");
      if (ptr_next_tok)
         return -E2BIG;
      if (!argv[0]) /* line contained only whitespace */
         continue;

      /* enable | disable | testmode */
      if (strcmp(argv[0], "enable") == 0) {
         ctx->fw_disabled = 0;
         ctx->disabled = 0;
         avm_pa_enable();
         pr_debug("avm_pa: enabled\n");
      } else if (strcmp(argv[0], "disable") == 0) {
         ctx->disabled = 1;
         ctx->fw_disabled = 1;
         avm_pa_disable();
         avm_pa_flush_sessions();
         pr_debug("avm_pa: disabled\n");
      } else if (strcmp(argv[0], "testmode") == 0) {
         ctx->fw_disabled = 1;
         ctx->disabled = 0;
         avm_pa_disable();
         pr_debug("avm_pa: testmode\n");
      /* hw_enable | hw_disable */
      } else if (strcmp(argv[0], "hw_enable") == 0) {
         ctx->hw_ppa_disabled = 0;
         pr_debug("avm_pa: hw enabled\n");
      } else if (strcmp(argv[0], "hw_disable") == 0) {
         ctx->hw_ppa_disabled = 1;
         avm_pa_flush_hw_sessions();
         pr_debug("avm_pa: hw disabled\n");
      } else if (strcmp(argv[0], "filter") == 0) {
         int old = ctx->filter_enabled;

         /* optional argument: "filter 0" disables, anything else enables */
         ctx->filter_enabled = argv[1] ? *argv[1] != '0' : 1;
         if (ctx->filter_enabled && !old && !list_empty(&ctx->accel_filter))
            avm_pa_flush_sessions();
      } else if (strcmp(argv[0], "nofilter") == 0) {
         ctx->filter_enabled = 0;
      /* flush */
      } else if (strcmp(argv[0], "flush") == 0) {
         if (argv[1]) {
            int i = 1;
            char *s;
            char buf[64] = "";

            /* Rebuild original, space separated selector */
            while (i < ARRAY_SIZE(argv) && (s = argv[i++])) {
               strlcat(buf, " ", sizeof(buf));
               strlcat(buf, s, sizeof(buf));
            }
            avm_pa_flush_sessions_select(buf+1, GFP_KERNEL);
            pr_debug("avm_pa: flush \"%s\"\n", buf+1);
         } else {
            avm_pa_flush_sessions();
            pr_debug("avm_pa: flush\n");
         }
      /* rpsenable | rpsdisable */
      } else if (strcmp(argv[0], "rpsenable") == 0) {
#ifdef CONFIG_AVM_PA_RPS
         ctx->rps_enabled = 1;
         pr_debug("avm_pa: rps enabled\n");
#else
         pr_debug("avm_pa: rps support not built-in\n");
#endif
      } else if (strcmp(argv[0], "rpsdisable") == 0) {
         ctx->rps_enabled = 0;
         pr_debug("avm_pa: rps disabled\n");
      /* nodbg */
      } else if (strcmp(argv[0], "nodbg") == 0) {
         ctx->dbgcapture = 0;
         ctx->dbgsession = 0;
         ctx->dbgnosession = 0;
         ctx->dbgtrace = 0;
         ctx->dbgmatch = 0;
         ctx->dbgprioack = 0;
         ctx->dbgprioacktrace = 0;
         ctx->dbgstats = 0;
         pr_debug("avm_pa: all debugs off\n");
      /* dbgcapture | nodbgcapture */
      } else if (strcmp(argv[0], "dbgcapture") == 0) {
         ctx->dbgcapture = 1;
         pr_debug("avm_pa: %s\n", argv[0]);
      } else if (strcmp(argv[0], "nodbgcapture") == 0) {
         ctx->dbgcapture = 0;
         pr_debug("avm_pa: %s\n", argv[0]);
      /* dbgsession | nodbgsession */
      } else if (strcmp(argv[0], "dbgsession") == 0) {
         ctx->dbgsession = 1;
         pr_debug("avm_pa: %s\n", argv[0]);
      } else if (strcmp(argv[0], "nodbgsession") == 0) {
         ctx->dbgsession = 0;
         pr_debug("avm_pa: %s\n", argv[0]);
      /* dbgnosession | nodbgnosession */
      } else if (strcmp(argv[0], "dbgnosession") == 0) {
         ctx->dbgnosession = 1;
         pr_debug("avm_pa: %s\n", argv[0]);
      } else if (strcmp(argv[0], "nodbgnosession") == 0) {
         ctx->dbgnosession = 0;
         pr_debug("avm_pa: %s\n",
argv[0]); /* trace | notrace */ } else if (strcmp(argv[0], "trace") == 0) { #if AVM_PA_TRACE ctx->dbgtrace = 1; pr_debug("avm_pa: %s\n", argv[0]); #else pr_err("avm_pa: trace not compiled in\n"); #endif } else if (strcmp(argv[0], "notrace") == 0) { ctx->dbgtrace = 0; pr_debug("avm_pa: %s\n", argv[0]); /* dbgmatch | nodbgmatch */ } else if (strcmp(argv[0], "nodbgmatch") == 0) { ctx->dbgmatch = 0; pr_debug("avm_pa: %s\n", argv[0]); } else if (strcmp(argv[0], "dbgmatch") == 0) { ctx->dbgmatch = 1; pr_debug("avm_pa: %s\n", argv[0]); /* dbgprioack | nodbgprioack */ } else if (strcmp(argv[0], "dbgprioack") == 0) { ctx->dbgprioack = 1; pr_debug("avm_pa: %s\n", argv[0]); } else if (strcmp(argv[0], "nodbgprioack") == 0) { ctx->dbgprioack = 0; pr_debug("avm_pa: %s\n", argv[0]); /* dbgprioacktrace | nodbgprioacktrace */ } else if (strcmp(argv[0], "dbgprioacktrace") == 0) { ctx->dbgprioacktrace = 1; pr_debug("avm_pa: %s\n", argv[0]); } else if (strcmp(argv[0], "nodbgprioacktrace") == 0) { ctx->dbgprioacktrace = 0; pr_debug("avm_pa: %s\n", argv[0]); /* dbgstats | nodbgstats */ } else if (strcmp(argv[0], "dbgstats") == 0) { ctx->dbgstats = 1; pr_debug("avm_pa: %s\n", argv[0]); } else if (strcmp(argv[0], "nodbgstats") == 0) { ctx->dbgstats = 0; pr_debug("avm_pa: %s\n", argv[0]); } else if (strstr(argv[0], "bsessions")) { ctx->bsession_allowed = strcmp(argv[0], "nobsessions") != 0; if (!ctx->bsession_allowed) avm_pa_flush_bsessions(); pr_debug("avm_pa: %s: bsessions_allowed = %d\n", argv[0], ctx->bsession_allowed); /* pid */ } else if (strcmp(argv[0], "pid") == 0 && argv[1]) { struct net_device *dev = dev_get_by_name(&init_net, argv[1]); if (dev) { if (avm_pa_dev_register(dev) < 0) pr_err("%s: failed to register PA PID\n", argv[1]); dev_put(dev); } else { pr_err("avm_pa_write_cmds(pid): dev %s not found\n", argv[1]); } /* vpid */ } else if (strcmp(argv[0], "vpid") == 0 && argv[1]) { struct net_device *dev = dev_get_by_name(&init_net, argv[1]); if (dev) { struct avm_pa_vpid_cfg cfg; snprintf(cfg.name, sizeof(cfg.name), "%s", argv[1]); cfg.v4_mtu = 1500; cfg.v6_mtu = 1500; if (avm_pa_dev_vpid_register(AVM_PA_DEVINFO(dev), &cfg) < 0) pr_err("%s: failed to register PA VPID\n", argv[1]); dev_put(dev); } else { pr_err("avm_pa_write_cmds(vpid): dev %s not found\n", argv[1]); } /* unreg */ } else if (strcmp(argv[0], "unreg") == 0 && argv[1]) { int ret; struct net_device *dev = dev_get_by_name(&init_net, argv[1]); DECLARE_COMPLETION_ONSTACK(done); if (dev) { avm_pa_dev_unregister(AVM_PA_DEVINFO(dev), &done); ret = wait_for_completion_interruptible(&done); if (ret != 0) return ret; } else { pr_err("avm_pa_write_cmds(unreg): dev %s not found\n", argv[1]); } /* prioack * * Note: This interface is now partially obsolete (prioack ) * in favour of the priomap interface defined below. */ } else if (strcmp(argv[0], "prioack") == 0) { unsigned val = 0; if (argv[1]) { pr_debug("avm_pa: prioack %s %s %s\n", argv[1], argv[2] ? argv[2] : "", argv[3] ? 
argv[3] : ""); if (strcmp(argv[1], "enable") == 0) { if (argv[2] && argv[3]) { if ((pid_handle = pa_find_pid_by_name(argv[2])) != 0) { avm_pa_pid_activate_tcpackprio(pid_handle, 1, simple_strtoul(argv[3], 0, 0)); } else { pr_err("avm_pa: %s %s: %s not found\n", argv[0], argv[1], argv[2]); } } } else if (strcmp(argv[1], "disable") == 0) { if (argv[2]) { if ((pid_handle = pa_find_pid_by_name(argv[2])) != 0) { avm_pa_pid_activate_tcpackprio(pid_handle, 0, 0); } else { pr_err("avm_pa: prioack %s: %s not found\n", argv[1], argv[2]); } } else { int n; for (n = 1; n < CONFIG_AVM_PA_MAX_PID; ++n) { avm_pa_pid_activate_tcpackprio(n, 0, 0); } } } else if (strcmp(argv[1], "tgetenable") == 0) { if (argv[2] && argv[3]) { if ((pid_handle = pa_find_pid_by_name(argv[2])) != 0) { avm_pa_pid_activate_tgetprio(pid_handle, 1, simple_strtoul(argv[3], 0, 0)); } else { pr_err("avm_pa: %s %s: %s not found\n", argv[0], argv[1], argv[2]); } } } else if (strcmp(argv[1], "tgetdisable") == 0) { if (argv[2]) { if ((pid_handle = pa_find_pid_by_name(argv[2])) != 0) { avm_pa_pid_activate_tgetprio(pid_handle, 0, 0); } else { pr_err("avm_pa: %s %s: %s not found\n", argv[0], argv[1], argv[2]); } } else { int n; for (n = 1; n < CONFIG_AVM_PA_MAX_PID; ++n) { avm_pa_pid_activate_tgetprio(n, 0, 0); } } } else if (strcmp(argv[1], "pthresh") == 0) { if (argv[2]) val = simple_strtoul(argv[2], 0, 0); if (val) ctx->prioack_thresh_packets = val; } else if (strcmp(argv[1], "ratio") == 0) { if (argv[2]) val = simple_strtoul(argv[2], 0, 0); if (val) ctx->prioack_ratio = val; } else { pr_debug("avm_pa: prioack unknown command %s \n (available commands: enable,disable,psize,pthresh,prio,ratio)\n", argv[1]); } } /* The priomap interface supersedes the old prioack interface. */ } else if (strcmp(argv[0], "priomap") == 0) { if (argv[1] && argv[2] && argv[3]) { unsigned short prio_map = simple_strtoul(argv[1], 0, 0); if ((pid_handle = pa_find_pid_by_name(argv[2])) != 0) { /* Command: priomap * * Effect: Enables or disables the priority map attached to the * device specified by the 'dev' parameter. The 'priomap' parameter * MUST equal either AVM_PA_PRIO_MAP_TACK (= 0x0000) or * AVM_PA_PRIO_MAP_TGET (= 0x0001) as defined in avm_pa.h. */ if (strcmp(argv[3], "enable") == 0) { avm_pa_pid_prio_map_enable(pid_handle, prio_map, 1); } else if (strcmp(argv[3], "disable") == 0) { avm_pa_pid_prio_map_enable(pid_handle, prio_map, 0); /* Command: priomap reset * * Effect: Resets the priority map attached to the device specified by * the 'dev' parameter. The 'priomap' parameter MUST equal either * AVM_PA_PRIO_MAP_TACK (= 0x0000) or AVM_PA_PRIO_MAP_TGET (= 0x0001) * as defined in avm_pa.h. */ } else if (strcmp(argv[3], "reset") == 0) { avm_pa_pid_prio_map_reset(pid_handle, prio_map); /* Command: priomap set_prio * * Effect: Manipulates the priority map entry specified by the * 'queue' parameter which is stored in the priority map attached * to the device specified by the 'dev' parameter. The 'priomap' * parameter MUST equal either AVM_PA_PRIO_MAP_TACK (= 0x0000) or * AVM_PA_PRIO_MAP_TGET (= 0x0001) as defined in avm_pa.h. 
*/ } else if (strcmp(argv[3], "setprio") == 0 && argv[4] && argv[5]) { avm_pa_pid_prio_map_set_prio_per_queue(pid_handle, prio_map, simple_strtoul(argv[4], 0, 0), /* queue */ simple_strtoul(argv[5], 0, 0)); /* prio */ } else { pr_err("avm_pa: priomap unknown command '%s'\n (available commands: enable, disable, reset, setprio)\n", argv[3]); } } else { pr_err("avm_pa: %s %s %s %s: %s not found\n", argv[0], argv[1], argv[2], argv[3], argv[2]); } } else { pr_err("avm_pa: %s: not enough parameters\n", argv[0]); } } else if (!strcmp(argv[0], "timeout")) { unsigned int val; if (!argv[1] || !argv[2]) return -EINVAL; val = simple_strtoul(argv[2], 0, 0); if (!strcmp(argv[1], "tcp")) ctx->tcp_timeout_secs = val; else if (!strcmp(argv[1], "udp")) ctx->udp_timeout_secs = val; else if (!strcmp(argv[1], "echo")) ctx->echo_timeout_secs = val; else if (!strcmp(argv[1], "bridge")) ctx->bridge_timeout_secs = val; else return -EINVAL; pr_info("avm_pa: setting timeout for %s to %u seconds\n", argv[1], val); } else if (!strcmp(argv[0], "debug")) { if (argv[1] && !strcmp(argv[1], "unreg-hw-pa")) { int ret; DECLARE_COMPLETION_ONSTACK(done); struct avm_hardware_pa tmp = ctx->hardware_pa; avm_pa_unregister_hardware_pa(&tmp, &done); ret = wait_for_completion_interruptible(&done); if (ret != 0) return ret; /* Give some time for new sessions in case this * test is driven with parallel sessions. Of course, * hardware sessions must not be created. */ msleep(100); if (pa_hw_pa_get()) { pa_hw_pa_put(); pr_err("avm_pa: hw_pa refcount should be 0 but really is %d\n", kref_read(&ctx->hw_pa_ref)); return -EIO; } ret = avm_pa_register_hardware_pa(&tmp); if (ret != 0) { pr_err("avm_pa: re-register hardware_pa failed: %d\n", ret); return ret; } } else if (argv[1] && !strcmp(argv[1], "non-pvid-macaddr")) { unsigned long val; char mac[ETH_ALEN]; struct avm_pa_macaddr *p; avm_pid_handle pid_handle; if (!argv[2] || !argv[3] || !argv[4]) return -EINVAL; if ((pid_handle = pa_find_pid_by_name(argv[2])) == 0) { pr_err("err pid_handle %s\n", argv[2]); return -EINVAL; } if (!mac_pton(argv[3], mac)) { pr_err("err mac %s\n", argv[3]); return -EINVAL; } if ((val = simple_strtoul(argv[4], 0, 0)) > 4095) { pr_err("err vlan %s\n", argv[4]); return -EINVAL; } /* If there is a suitable pvid macaddr, then this new macaddrs * should be visible in /proc/net/avm_pa/macaddrs even after unlinking * (with 0 references) and should disappear after flushing all sessions. 
*/ local_bh_disable(); p = pa_macaddr_link(mac, pid_handle, 0, val); if (p) pa_macaddr_unlink(p); local_bh_enable(); if (!p) return -EIO; } } else { pr_err("avm_pa_write_cmds: %s: unknown command\n", argv[0]); } } return count; } /* ------------------------------------------------------------------------ */ const struct proc_ops avm_pa_control_ops = { .proc_write = avm_pa_write_cmds, }; static ssize_t avm_pa_read_show_filter(struct file *file, char __user *buffer, size_t count, loff_t *offset) { struct list_head *selector_list = PDE_DATA(file_inode(file)); ssize_t ret; if (*offset || list_empty(selector_list)) return 0; ret = avm_pa_dump_selector_user(selector_list, buffer, count); *offset += ret; return ret; } static ssize_t avm_pa_write_show_filter(struct file *file, const char __user *buffer, size_t count, loff_t *offset) { ssize_t ret; struct list_head *selector_list = PDE_DATA(file_inode(file)); ret = avm_pa_parse_selector_user(selector_list, buffer, count); if (ret < 0) return ret; *offset += ret; return ret; } static const struct proc_ops selector_ops = { .proc_read = avm_pa_read_show_filter, .proc_write = avm_pa_write_show_filter, }; static struct proc_dir_entry *dir_entry = 0; static void __init avm_pa_proc_init(void) { struct avm_pa_global *ctx = &pa_glob; dir_entry = proc_net_mkdir(&init_net, "avm_pa", init_net.proc_net); proc_create("control", S_IFREG|S_IWUSR, dir_entry, &avm_pa_control_ops); proc_create("brief", S_IRUGO, dir_entry, &brief_show_ops); proc_create("memory", S_IRUGO, dir_entry, &memory_show_ops); proc_create("status", S_IRUGO, dir_entry, &status_show_ops); proc_create("stats", S_IRUGO, dir_entry, &stats_show_ops); proc_create("pids", S_IRUGO, dir_entry, &pids_show_ops); proc_create("vpids", S_IRUGO, dir_entry, &vpids_show_ops); proc_create("sessions", S_IRUGO, dir_entry, &sess_show_ops); proc_create_data("filter", S_IRUGO|S_IWUSR, dir_entry, &selector_ops, &ctx->accel_filter); proc_create_data("xsession", S_IRUGO|S_IWUSR, dir_entry, &selector_ops, &ctx->show_filter); proc_create("bsessions", S_IRUGO, dir_entry, &bsess_show_ops); proc_create("macaddrs", S_IRUGO, dir_entry, &macaddr_show_ops); proc_create("hashes", S_IRUGO, dir_entry, &hash_show_ops); proc_create("prioack", S_IRUGO, dir_entry, &prioack_show_ops); proc_create("priomaps", S_IRUGO, dir_entry, &priomaps_show_ops); avm_pa_sg_proc_init(dir_entry); } static void __exit avm_pa_proc_exit(void) { remove_proc_entry("control", dir_entry); remove_proc_entry("brief", dir_entry); remove_proc_entry("memory", dir_entry); remove_proc_entry("status", dir_entry); remove_proc_entry("stats", dir_entry); remove_proc_entry("pids", dir_entry); remove_proc_entry("vpids", dir_entry); remove_proc_entry("sessions", dir_entry); remove_proc_entry("filter", dir_entry); remove_proc_entry("xsession", dir_entry); remove_proc_entry("bsessions", dir_entry); remove_proc_entry("macaddrs", dir_entry); remove_proc_entry("hashes", dir_entry); remove_proc_entry("prioack", dir_entry); remove_proc_entry("priomaps", dir_entry); avm_pa_sg_proc_exit(dir_entry); remove_proc_entry("avm_pa", init_net.proc_net); } #endif /* ------------------------------------------------------------------------ */ /* -------- misc device for capture tracking ------------------------------ */ /* ------------------------------------------------------------------------ */ static ssize_t avm_pa_misc_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static unsigned int avm_pa_misc_poll(struct file *file, poll_table *wait) { return 0; } 
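/*
 * Example (user-space sketch, not part of this module): driving the proc
 * interface created by avm_pa_proc_init() above. The file names match the
 * entries registered there; the avm_pa_ctl() helper itself is hypothetical.
 *
 *   #include <stdio.h>
 *
 *   static int avm_pa_ctl(const char *cmd)
 *   {
 *      FILE *f = fopen("/proc/net/avm_pa/control", "w");
 *
 *      if (!f)
 *         return -1;
 *      fputs(cmd, f);   // one command per line, e.g. "disable\n"
 *      return fclose(f);
 *   }
 *
 *   // avm_pa_ctl("disable\n");  acceleration off, current sessions flushed
 *   // avm_pa_ctl("flush\n");    flush all sessions, acceleration unchanged
 */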
static int avm_pa_misc_open(struct inode *inode, struct file *file) { struct avm_pa_global *ctx = &pa_glob; atomic_inc(&ctx->misc_is_open); return 0; } static int avm_pa_misc_release(struct inode *inode, struct file *file) { struct avm_pa_global *ctx = &pa_glob; if (atomic_read(&ctx->misc_is_open) > 0) atomic_dec(&ctx->misc_is_open); return 0; } static const struct file_operations avm_pa_misc_fops = { .llseek = no_llseek, .read = avm_pa_misc_read, .poll = avm_pa_misc_poll, .open = avm_pa_misc_open, .release = avm_pa_misc_release, }; static struct miscdevice avm_pa_misc_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "avm_pa", .fops = &avm_pa_misc_fops }; /* ------------------------------------------------------------------------ */ /* -------- init & exit functions ----------------------------------------- */ /* ------------------------------------------------------------------------ */ /* * early init is called before the init functions of all device drivers. */ int __init avm_pa_early_init(void) { struct avm_pa_global *ctx = &pa_glob; int i; pr_info("AVM PA for %s (early init)\n", linux_banner); for (i = 0; i < AVM_PA_LIST_MAX; i++) INIT_LIST_HEAD(&ctx->sess_list[i].sessions); for (i = 0; i < AVM_PA_MAX_MACADDR; i++) { INIT_HLIST_HEAD(&ctx->macaddr_hashtab[i]); } INIT_HLIST_HEAD(&ctx->egress_freelist); #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0) setup_timer(&ctx->tick_timer, pa_session_tick, 0); setup_timer(&ctx->est_timer, avm_pa_est_timer, 0); #else timer_setup(&ctx->tick_timer, pa_session_tick, 0); timer_setup(&ctx->est_timer, avm_pa_est_timer, 0); #endif skb_queue_head_init(&ctx->irqqueue); tasklet_init(&ctx->irqtasklet, avm_pa_irq_tasklet, 0); avm_pa_init_freelist(); avm_pa_sg_init(); return 0; } /* * avm_pa_init is called together with the init functions * of the device drivers. 
*/ int __init avm_pa_init(void) { struct avm_pa_global *ctx = &pa_glob; int __maybe_unused i; { /* complain if avm_pa_pkt_info or avm_pa_dev_info crosses the reserved * area (usually 256 and 32 bytes respectively) */ struct sk_buff *skb __maybe_unused = NULL; struct net_device *dev __maybe_unused = NULL; struct net *net __maybe_unused = NULL; #if defined(AVM_PKT_INFO_MAX) BUILD_BUG_ON(sizeof(struct avm_pa_pkt_info) > AVM_PKT_INFO_MAX); #else BUILD_BUG_ON(sizeof(struct avm_pa_pkt_info) > sizeof(skb->avm_pa)); #endif BUILD_BUG_ON(sizeof(struct avm_pa_dev_info) > sizeof(dev->avm_pa)); /* net->avm_pa holds two avm_pa_dev_info (ipv4 and ipv6) */ BUILD_BUG_ON(2*sizeof(struct avm_pa_dev_info) > sizeof(net->avm_pa)); } pr_info("AVM PA %s for Linux %s (late init)\n", AVM_PA_VERSION, linux_banner); if (misc_register(&avm_pa_misc_dev) < 0) pr_err("avm_pa: misc_register() failed"); #ifdef CONFIG_PROC_FS INIT_LIST_HEAD(&ctx->accel_filter); INIT_LIST_HEAD(&ctx->show_filter); avm_pa_proc_init(); #endif #ifdef CONFIG_AVM_PA_RPS for (i = 0; i < CONFIG_AVM_PA_RPS_QUEUES; i++) { ctx->rps[i].q = NULL; ctx->rps[i].r_sz = PA_RPS_REVERSE_SIZE; ctx->rps[i].r = kmalloc_array(PA_RPS_REVERSE_SIZE, sizeof(struct sk_buff *), GFP_KERNEL); tasklet_init(&ctx->rps[i].ipi_task, pa_rps_ipi_task, (unsigned long) &ctx->rps[i]); tasklet_init(&ctx->rps[i].dequeue_task, pa_rps_dequeue_task, (unsigned long) &ctx->rps[i]); INIT_CSD(&ctx->rps[i].csd, (smp_call_func_t) tasklet_schedule, &ctx->rps[i].dequeue_task); if (!ctx->rps[i].r) { pr_crit("RPS disabled due to kmalloc failure"); ctx->rps_enabled = 0; } } #endif avm_pa_netdev_init(); avm_pa_enable(); ctx->disabled = ctx->fw_disabled = 0; return 0; } void __exit avm_pa_exit(void) { struct avm_pa_global *ctx = &pa_glob; struct sk_buff *skb; int __maybe_unused i; ctx->disabled = 1; ctx->fw_disabled = 1; avm_pa_disable(); tasklet_kill(&ctx->irqtasklet); while ((skb = skb_dequeue(&ctx->irqqueue)) != 0) kfree_skb(skb); del_timer_sync(&ctx->tick_timer); pa_session_gc_once(); pa_session_gc_once(); avm_pa_sg_exit(); #ifdef CONFIG_AVM_PA_RPS for (i = 0; i < CONFIG_AVM_PA_RPS_QUEUES; i++) { kfree(ctx->rps[i].r); } #endif #ifdef CONFIG_PROC_FS avm_pa_proc_exit(); avm_pa_selector_free(&ctx->show_filter); avm_pa_selector_free(&ctx->accel_filter); #endif misc_deregister(&avm_pa_misc_dev); avm_pa_reset_stats(); } #ifdef CONFIG_IFX_PPA void avm_pa_disable_atm_hw_tx_acl(void){ int n; struct avm_pa_global *ctx = &pa_glob; for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = PA_PID(ctx, n); if (pid->pid_handle == 0) continue; if (pid->hw && pid->hw->atmvcc){ pid->hw->flags |= AVMNET_DEVICE_IFXPPA_DISABLE_TX_ACL; } } } EXPORT_SYMBOL(avm_pa_disable_atm_hw_tx_acl); void avm_pa_enable_atm_hw_tx_acl(void){ struct avm_pa_global *ctx = &pa_glob; int n; for (n=1; n < CONFIG_AVM_PA_MAX_PID; n++) { struct avm_pa_pid *pid = PA_PID(ctx, n); if (pid->pid_handle == 0) continue; if (pid->hw && pid->hw->atmvcc){ pid->hw->flags &= ~AVMNET_DEVICE_IFXPPA_DISABLE_TX_ACL; } } } EXPORT_SYMBOL(avm_pa_enable_atm_hw_tx_acl); #endif subsys_initcall(avm_pa_early_init); /* init avm pa before devices */ module_init(avm_pa_init); module_exit(avm_pa_exit);
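/*
 * Example (illustrative sketch, not part of this module): registering a
 * network device with AVM PA from a driver, modeled on the "pid" and "vpid"
 * commands handled in avm_pa_write_cmds(). The function name, the combined
 * PID+VPID flow and the MTU values are hypothetical; the avm_pa calls are
 * the ones used above.
 *
 *   static int example_register_with_avm_pa(struct net_device *dev)
 *   {
 *      struct avm_pa_vpid_cfg cfg;
 *
 *      if (avm_pa_dev_register(dev) < 0)
 *         return -EIO;
 *
 *      snprintf(cfg.name, sizeof(cfg.name), "%s", dev->name);
 *      cfg.v4_mtu = 1500;
 *      cfg.v6_mtu = 1500;
 *      return avm_pa_dev_vpid_register(AVM_PA_DEVINFO(dev), &cfg);
 *   }
 */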