--- zzzz-none-000/linux-3.10.107/net/ipv6/route.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/ipv6/route.c 2021-02-04 17:41:59.000000000 +0000 @@ -54,10 +54,14 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include @@ -66,13 +70,13 @@ #endif enum rt6_nud_state { - RT6_NUD_FAIL_HARD = -2, - RT6_NUD_FAIL_SOFT = -1, + RT6_NUD_FAIL_HARD = -3, + RT6_NUD_FAIL_PROBE = -2, + RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest); +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -83,14 +87,18 @@ static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_policy_failed(struct sk_buff *skb); +static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void rt6_dst_from_metrics_check(struct rt6_info *rt); +static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -102,34 +110,86 @@ const struct in6_addr *gwaddr, int ifindex); #endif -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); + +static void rt6_uncached_list_add(struct rt6_info *rt) { - struct rt6_info *rt = (struct rt6_info *) dst; - struct inet_peer *peer; - u32 *p = NULL; + struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - if (!(rt->dst.flags & DST_HOST)) - return dst_cow_metrics_generic(dst, old); + rt->dst.flags |= DST_NOCACHE; + rt->rt6i_uncached_list = ul; + + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt6i_uncached, &ul->head); + spin_unlock_bh(&ul->lock); +} + +static void rt6_uncached_list_del(struct rt6_info *rt) +{ + if (!list_empty(&rt->rt6i_uncached)) { + struct uncached_list *ul = rt->rt6i_uncached_list; - peer = rt6_get_peer_create(rt); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; + spin_lock_bh(&ul->lock); + list_del(&rt->rt6i_uncached); + spin_unlock_bh(&ul->lock); + } +} - p = peer->metrics; - if (inet_metrics_new(peer)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); +static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) +{ + struct net_device *loopback_dev = net->loopback_dev; + int cpu; - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); + if (dev == loopback_dev) + return; - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + struct rt6_info *rt; + + spin_lock_bh(&ul->lock); + list_for_each_entry(rt, &ul->head, rt6i_uncached) { + struct inet6_dev *rt_idev = rt->rt6i_idev; + struct net_device *rt_dev = rt->dst.dev; + + if (rt_idev->dev == dev) { + rt->rt6i_idev = in6_dev_get(loopback_dev); + in6_dev_put(rt_idev); + } + + if (rt_dev == dev) { + rt->dst.dev = loopback_dev; + dev_hold(rt->dst.dev); + dev_put(rt_dev); + } } + spin_unlock_bh(&ul->lock); } - return p; +} + +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) +{ + return dst_metrics_write_ptr(rt->dst.from); +} + +/* Define route change notification chain. */ +ATOMIC_NOTIFIER_HEAD(ip6route_chain); + +static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) + return NULL; + else + return dst_cow_metrics_generic(dst, old); } static inline const void *choose_neigh_daddr(struct rt6_info *rt, @@ -161,7 +221,6 @@ static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, - .protocol = cpu_to_be16(ETH_P_IPV6), .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, @@ -195,22 +254,15 @@ { } -static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, - unsigned long old) -{ - return NULL; -} - static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, - .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, .mtu = ip6_blackhole_mtu, .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, - .cow_metrics = ip6_rt_blackhole_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .neigh_lookup = ip6_neigh_lookup, }; @@ -250,6 +302,21 @@ .rt6i_ref = ATOMIC_INIT(1), }; +static const struct rt6_info ip6_policy_failed_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EACCES, + .input = ip6_pkt_policy_failed, + .output = ip6_pkt_policy_failed_out, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), @@ -257,7 +324,7 @@ .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard, + .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -267,36 +334,67 @@ #endif +static void rt6_info_init(struct rt6_info *rt) +{ + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); +} + /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) { - struct dst_entry *dst = &rt->dst; + if (rt) + rt6_info_init(rt); + + return rt; +} - memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid(net); - INIT_LIST_HEAD(&rt->rt6i_siblings); - rt->rt6i_nsiblings = 0; +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; + + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } } + return rt; } static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; + struct inet6_dev *idev; - if (!(rt->dst.flags & DST_HOST)) - dst_destroy_metrics_generic(dst); + dst_destroy_metrics_generic(dst); + free_percpu(rt->rt6i_pcpu); + rt6_uncached_list_del(rt); + idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -304,27 +402,6 @@ dst->from = NULL; dst_release(from); - - if (rt6_has_peer(rt)) { - struct inet_peer *peer = rt6_peer_ptr(rt); - inet_putpeer(peer); - } -} - -void rt6_bind_peer(struct rt6_info *rt, int create) -{ - struct inet_peer_base *base; - struct inet_peer *peer; - - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; - - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); - } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -347,6 +424,14 @@ } } +static bool __rt6_check_expired(const struct rt6_info *rt) +{ + if (rt->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, rt->dst.expires); + else + return false; +} + static bool rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) { @@ -358,12 +443,6 @@ return false; } -static bool rt6_need_strict(const struct in6_addr *daddr) -{ - return ipv6_addr_type(daddr) & - (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); -} - /* Multipath route selection: * Hash based function using packet header and flowlabel. * Adapted from fib_info_hashfn() @@ -371,35 +450,12 @@ static int rt6_info_hash_nhsfn(unsigned int candidate_count, const struct flowi6 *fl6) { - unsigned int val = fl6->flowi6_proto; - - val ^= ipv6_addr_hash(&fl6->daddr); - val ^= ipv6_addr_hash(&fl6->saddr); - - /* Work only if this not encapsulated */ - switch (fl6->flowi6_proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_SCTP: - val ^= (__force u16)fl6->fl6_sport; - val ^= (__force u16)fl6->fl6_dport; - break; - - case IPPROTO_ICMPV6: - val ^= (__force u16)fl6->fl6_icmp_type; - val ^= (__force u16)fl6->fl6_icmp_code; - break; - } - /* RFC6438 recommands to use flowlabel */ - val ^= (__force u32)fl6->flowlabel; - - /* Perhaps, we need to tune, this function? */ - val = val ^ (val >> 7) ^ (val >> 12); - return val % candidate_count; + return get_hash_from_flowi6(fl6) % candidate_count; } static struct rt6_info *rt6_multipath_select(struct rt6_info *match, - struct flowi6 *fl6) + struct flowi6 *fl6, int oif, + int strict) { struct rt6_info *sibling, *next_sibling; int route_choosen; @@ -413,6 +469,8 @@ &match->rt6i_siblings, rt6i_siblings) { route_choosen--; if (route_choosen == 0) { + if (rt6_score_route(sibling, oif, strict) < 0) + break; match = sibling; break; } @@ -445,10 +503,10 @@ if (dev->flags & IFF_LOOPBACK) { if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { - if (flags & RT6_LOOKUP_F_IFACE && oif) + if (flags & RT6_LOOKUP_F_IFACE) continue; - if (local && (!oif || - local->rt6i_idev->dev->ifindex == oif)) + if (local && + local->rt6i_idev->dev->ifindex == oif) continue; } local = sprt; @@ -485,13 +543,14 @@ container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); dev_put(work->dev); - kfree(w); + kfree(work); } static void rt6_probe(struct rt6_info *rt) { + struct __rt6_probe_work *work; struct neighbour *neigh; /* * Okay, this does not seem to be appropriate @@ -506,34 +565,33 @@ rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { - write_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) goto out; - } - - if (!neigh || - time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct __rt6_probe_work *work; + work = NULL; + write_lock(&neigh->lock); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, + neigh->updated + + rt->rt6i_idev->cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + __neigh_set_probe_once(neigh); + } + write_unlock(&neigh->lock); + } else { work = kmalloc(sizeof(*work), GFP_ATOMIC); + } - if (neigh && work) - neigh->updated = jiffies; - - if (neigh) - write_unlock(&neigh->lock); + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); + } - if (work) { - INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; - schedule_work(&work->work); - } - } else { out: - write_unlock(&neigh->lock); - } rcu_read_unlock_bh(); } #else @@ -574,11 +632,13 @@ #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(neigh->nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; + else + ret = RT6_NUD_FAIL_PROBE; #endif read_unlock(&neigh->lock); } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? - RT6_NUD_SUCCEED : RT6_NUD_FAIL_SOFT; + RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock_bh(); @@ -610,21 +670,28 @@ { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); - if (m == RT6_NUD_FAIL_SOFT && !IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { + if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ - } else if (m < 0) { + } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(rt); + /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; @@ -639,15 +706,33 @@ u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match; + struct rt6_info *rt, *match, *cont; int mpri = -1; match = NULL; - for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + cont = NULL; + for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + match = find_match(rt, oif, strict, &mpri, match, do_rr); - for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + } + + for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + + match = find_match(rt, oif, strict, &mpri, match, do_rr); + } + + if (match || !cont) + return match; + + for (rt = cont; rt; rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -681,6 +766,11 @@ return match ? match : net->ipv6.ip6_null_entry; } +static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); +} + #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) @@ -757,23 +847,22 @@ } #endif -#define BACKTRACK(__net, saddr) \ -do { \ - if (rt == __net->ipv6.ip6_null_entry) { \ - struct fib6_node *pn; \ - while (1) { \ - if (fn->fn_flags & RTN_TL_ROOT) \ - goto out; \ - pn = fn->parent; \ - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \ - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \ - else \ - fn = pn; \ - if (fn->fn_flags & RTN_RTINFO) \ - goto restart; \ - } \ - } \ -} while (0) +static struct fib6_node* fib6_backtrack(struct fib6_node *fn, + struct in6_addr *saddr) +{ + struct fib6_node *pn; + while (1) { + if (fn->fn_flags & RTN_TL_ROOT) + return NULL; + pn = fn->parent; + if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) + fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + else + fn = pn; + if (fn->fn_flags & RTN_RTINFO) + return fn; + } +} static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, @@ -788,16 +877,19 @@ rt = fn->leaf; rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6); - BACKTRACK(net, &fl6->saddr); -out: + rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto restart; + } dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); return rt; } -struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, +struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags) { return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); @@ -827,7 +919,6 @@ return NULL; } - EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. @@ -836,14 +927,15 @@ be destroyed. */ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, + struct mx6_config *mxc) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info); + err = fib6_add(&table->tb6_root, rt, info, mxc); write_unlock_bh(&table->tb6_lock); return err; @@ -851,15 +943,15 @@ int ip6_ins_rt(struct rt6_info *rt) { - struct nl_info info = { - .nl_net = dev_net(rt->dst.dev), - }; - return __ip6_ins_rt(rt, &info); + struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; + struct mx6_config mxc = { .mx = NULL, }; + + return __ip6_ins_rt(rt, &info, &mxc); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -867,17 +959,25 @@ * Clone the route. */ - rt = ip6_rt_copy(ort, daddr); + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; - if (rt) { - if (!(rt->rt6i_flags & RTF_GATEWAY)) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) - rt->rt6i_flags |= RTF_ANYCAST; - } + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0); - rt->rt6i_flags |= RTF_CACHE; + if (!rt) + return NULL; + + ip6_rt_copy_init(rt, ort); + rt->rt6i_flags |= RTF_CACHE; + rt->rt6i_metric = 0; + rt->dst.flags |= DST_HOST; + rt->rt6i_dst.addr = *daddr; + rt->rt6i_dst.plen = 128; + if (!rt6_is_gw_or_nonexthop(ort)) { + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; @@ -889,85 +989,165 @@ return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, - const struct in6_addr *daddr) +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { - struct rt6_info *rt = ip6_rt_copy(ort, daddr); + struct rt6_info *pcpu_rt; - if (rt) - rt->rt6i_flags |= RTF_CACHE; - return rt; + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags); + + if (!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +/* It should be called with read_lock_bh(&tb6_lock) acquired */ +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, **p; + + p = this_cpu_ptr(rt->rt6i_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) { + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + } + return pcpu_rt; +} + +static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct rt6_info *pcpu_rt, *prev, **p; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + dst_hold(&net->ipv6.ip6_null_entry->dst); + return net->ipv6.ip6_null_entry; + } + + read_lock_bh(&table->tb6_lock); + if (rt->rt6i_pcpu) { + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + } else { + /* rt has been removed from the fib6 tree + * before we have a chance to acquire the read_lock. + * In this case, don't brother to create a pcpu rt + * since rt is going away anyway. The next + * dst_check() will trigger a re-lookup. + */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = rt; + } + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + read_unlock_bh(&table->tb6_lock); + return pcpu_rt; } static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { - struct fib6_node *fn; - struct rt6_info *rt, *nrt; + struct fib6_node *fn, *saved_fn; + struct rt6_info *rt; int strict = 0; - int attempts = 3; - int err; - int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; strict |= flags & RT6_LOOKUP_F_IFACE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; -relookup: read_lock_bh(&table->tb6_lock); -restart_2: fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + saved_fn = fn; -restart: - rt = rt6_select(fn, oif, strict | reachable); - if (rt->rt6i_nsiblings && oif == 0) - rt = rt6_multipath_select(rt, fl6); - BACKTRACK(net, &fl6->saddr); - if (rt == net->ipv6.ip6_null_entry || - rt->rt6i_flags & RTF_CACHE) - goto out; + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + oif = 0; - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); +redo_rt6_select: + rt = rt6_select(fn, oif, strict); + if (rt->rt6i_nsiblings) + rt = rt6_multipath_select(rt, fl6, oif, strict); + if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto redo_rt6_select; + else if (strict & RT6_LOOKUP_F_REACHABLE) { + /* also consider unreachable route */ + strict &= ~RT6_LOOKUP_F_REACHABLE; + fn = saved_fn; + goto redo_rt6_select; + } + } - if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) - nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST)) - nrt = rt6_alloc_clone(rt, &fl6->daddr); - else - goto out2; - ip6_rt_put(rt); - rt = nrt ? : net->ipv6.ip6_null_entry; + if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); + + rt6_dst_from_metrics_check(rt); + return rt; + } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && + !(rt->rt6i_flags & RTF_GATEWAY))) { + /* Create a RTF_CACHE clone which will not be + * owned by the fib6 tree. It is for the special case where + * the daddr in the skb during the neighbor look-up is different + * from the fl6->daddr used to look-up route here. + */ - dst_hold(&rt->dst); - if (nrt) { - err = ip6_ins_rt(nrt); - if (!err) - goto out2; - } + struct rt6_info *uncached_rt; - if (--attempts <= 0) - goto out2; + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - /* - * Race condition! In the gap, when table->tb6_lock was - * released someone could insert this route. Relookup. - */ - ip6_rt_put(rt); - goto relookup; + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + dst_release(&rt->dst); -out: - if (reachable) { - reachable = 0; - goto restart_2; - } - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); -out2: - rt->dst.lastuse = jiffies; - rt->dst.__use++; + if (uncached_rt) + rt6_uncached_list_add(uncached_rt); + else + uncached_rt = net->ipv6.ip6_null_entry; - return rt; + dst_hold(&uncached_rt->dst); + return uncached_rt; + + } else { + /* Get a percpu copy */ + + struct rt6_info *pcpu_rt; + + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = rt6_get_pcpu_route(rt); + + if (pcpu_rt) { + read_unlock_bh(&table->tb6_lock); + } else { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. + */ + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + pcpu_rt = rt6_make_pcpu_route(rt); + dst_release(&rt->dst); + } + + return pcpu_rt; + + } } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -991,8 +1171,9 @@ const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = skb->dev->ifindex, + .flowi6_iif = l3mdev_fib_oif(skb->dev), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -1000,6 +1181,10 @@ .flowi6_proto = iph->nexthdr, }; + tun_info = skb_tunnel_info(skb); + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) + fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } @@ -1009,25 +1194,31 @@ return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, - struct flowi6 *fl6) +struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, + struct flowi6 *fl6, int flags) { - int flags = 0; + struct dst_entry *dst; + bool any_src; + + dst = l3mdev_rt6_dst_by_oif(net, fl6); + if (dst) + return dst; fl6->flowi6_iif = LOOPBACK_IFINDEX; - if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + any_src = ipv6_addr_any(&fl6->saddr); + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || + (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; - if (!ipv6_addr_any(&fl6->saddr)) + if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } - -EXPORT_SYMBOL(ip6_route_output); +EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { @@ -1036,25 +1227,20 @@ rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { - new = &rt->dst; - - memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); - rt6_init_peer(rt, net->ipv6.peers); + rt6_info_init(rt); + new = &rt->dst; new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_out; - if (dst_metrics_read_only(&ort->dst)) - new->_metrics = ort->dst._metrics; - else - dst_copy_metrics(new, &ort->dst); + dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1073,6 +1259,34 @@ * Destination cache support functions */ +static void rt6_dst_from_metrics_check(struct rt6_info *rt) +{ + if (rt->dst.from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); +} + +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +{ + if (!__rt6_check_expired(rt) && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1083,16 +1297,14 @@ * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev))) - return NULL; - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - - if (rt6_check_expired(rt)) - return NULL; + rt6_dst_from_metrics_check(rt); - return dst; + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) + return rt6_dst_from_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -1123,32 +1335,76 @@ if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) +{ + struct net *net = dev_net(rt->dst.dev); + + rt->rt6i_flags |= RTF_MODIFIED; + rt->rt6i_pmtu = mtu; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); +} + +static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { - struct rt6_info *rt6 = (struct rt6_info*)dst; + return !(rt->rt6i_flags & RTF_CACHE) && + (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); +} + +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, + const struct ipv6hdr *iph, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info *)dst; + + if (rt6->rt6i_flags & RTF_LOCAL) + return; dst_confirm(dst); - if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { - struct net *net = dev_net(dst->dev); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu >= dst_mtu(dst)) + return; - rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (!rt6_cache_allowed_for_pmtu(rt6)) { + rt6_do_update_pmtu(rt6, mtu); + } else { + const struct in6_addr *daddr, *saddr; + struct rt6_info *nrt6; - dst_metric_set(dst, RTAX_MTU, mtu); - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + return; + } + nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + if (nrt6) { + rt6_do_update_pmtu(nrt6, mtu); + + /* ip6_ins_rt(nrt6) will bump the + * rt6->rt6i_node->fn_sernum + * which will fail the next rt6_check() and + * invalidate the sk->sk_dst_cache. + */ + ip6_ins_rt(nrt6); + } } } +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); +} + void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark) { @@ -1158,15 +1414,14 @@ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.flowi6_flags = 0; + fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -1178,6 +1433,83 @@ } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); +/* Handle redirects */ +struct ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* Get the "current" route for this destination and + * check if the redirect has come from approriate router. + * + * RFC 4861 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt6_check_expired(rt)) + continue; + if (rt->dst.error) + break; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->dst.dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; + } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + else if (rt->dst.error) { + rt = net->ipv6.ip6_null_entry; + goto out; + } + + if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto restart; + } + +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct dst_entry *ip6_route_redirect(struct net *net, + const struct flowi6 *fl6, + const struct in6_addr *gateway) +{ + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip6rd_flowi rdfl; + + rdfl.fl6 = *fl6; + rdfl.gateway = *gateway; + + return fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); +} + void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; @@ -1185,20 +1517,39 @@ struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; - fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); - dst = ip6_route_output(net, NULL, &fl6); - if (!dst->error) - rt6_do_redirect(dst, NULL, skb); + dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.daddr = msg->dest; + fl6.saddr = iph->daddr; + + dst = ip6_route_redirect(net, &fl6, &iph->saddr); + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} + void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); @@ -1229,12 +1580,17 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + goto out; + mtu = IPV6_MIN_MTU; rcu_read_lock(); @@ -1261,7 +1617,7 @@ if (unlikely(!idev)) return ERR_PTR(-ENODEV); - rt = ip6_dst_alloc(net, dev, 0, NULL); + rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); @@ -1348,7 +1704,7 @@ goto out; net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size); + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; @@ -1357,44 +1713,76 @@ return entries > rt_max_size; } -int ip6_dst_hoplimit(struct dst_entry *dst) +static int ip6_convert_metrics(struct mx6_config *mxc, + const struct fib6_config *cfg) { - int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - if (hoplimit == 0) { - struct net_device *dev = dst->dev; - struct inet6_dev *idev; - - rcu_read_lock(); - idev = __in6_dev_get(dev); - if (idev) - hoplimit = idev->cnf.hop_limit; - else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; - rcu_read_unlock(); + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + u32 *mp; + + if (!cfg->fc_mx) + return 0; + + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (unlikely(!mp)) + return -ENOMEM; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + goto err; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; + + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); } - return hoplimit; -} -EXPORT_SYMBOL(ip6_dst_hoplimit); -/* - * - */ + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + } -int ip6_route_add(struct fib6_config *cfg) + mxc->mx = mp; + return 0; + err: + kfree(mp); + return -EINVAL; +} + +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { - int err; struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; int addr_type; + int err = -EINVAL; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) - return -EINVAL; + goto out; #ifndef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len) - return -EINVAL; + goto out; #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -1424,7 +1812,8 @@ if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table); + rt = ip6_dst_alloc(net, NULL, + (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT); if (!rt) { err = -ENOMEM; @@ -1452,19 +1841,30 @@ rt->dst.output = ip6_output; - ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); - rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; - if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) { - u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (!metrics) { - err = -ENOMEM; + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, AF_INET6, cfg, + &lwtstate); + if (err) goto out; + rt->dst.lwtstate = lwtstate_get(lwtstate); + if (lwtunnel_output_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_output = rt->dst.output; + rt->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input; } - dst_init_metrics(&rt->dst, metrics, 0); } + + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->rt6i_dst.plen = cfg->fc_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->dst.flags |= DST_HOST; + #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->rt6i_src.plen = cfg->fc_src_len; @@ -1497,7 +1897,7 @@ switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard; + rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -1505,10 +1905,17 @@ rt->dst.output = ip6_pkt_prohibit_out; rt->dst.input = ip6_pkt_prohibit; break; + case RTN_POLICY_FAILED: + rt->dst.error = -EACCES; + rt->dst.output = ip6_pkt_policy_failed_out; + rt->dst.input = ip6_pkt_policy_failed; + break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; @@ -1521,9 +1928,21 @@ int gwa_type; gw_addr = &cfg->fc_gateway; - rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + err = -EINVAL; + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) + goto out; + + rt->rt6i_gateway = *gw_addr; + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { struct rt6_info *grt; @@ -1534,7 +1953,6 @@ (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK */ - err = -EINVAL; if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; @@ -1583,32 +2001,13 @@ rt->rt6i_flags = cfg->fc_flags; install_route: - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - if (type > RTAX_MAX) { - err = -EINVAL; - goto out; - } - - dst_metric_set(&rt->dst, type, nla_get_u32(nla)); - } - } - } - rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); - return __ip6_ins_rt(rt, &cfg->fc_nlinfo); - + return rt; out: if (dev) dev_put(dev); @@ -1616,6 +2015,39 @@ in6_dev_put(idev); if (rt) dst_free(&rt->dst); + + return ERR_PTR(err); +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt; + int err; + + rt = ip6_route_info_create(cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto out; + } + + err = ip6_convert_metrics(&mxc, cfg); + if (err) + goto out; + + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); + if (!err) + atomic_notifier_call_chain(&ip6route_chain, + RTM_NEWROUTE, rt); + + kfree(mxc.mx); + + return err; +out: + if (rt) + dst_free(&rt->dst); + return err; } @@ -1625,7 +2057,8 @@ struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -1635,6 +2068,9 @@ err = fib6_del(rt, info); write_unlock_bh(&table->tb6_lock); + if (!err) + atomic_notifier_call_chain(&ip6route_chain, + RTM_DELROUTE, rt); out: ip6_rt_put(rt); return err; @@ -1667,6 +2103,9 @@ if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if ((rt->rt6i_flags & RTF_CACHE) && + !(cfg->fc_flags & RTF_CACHE)) + continue; if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -1676,8 +2115,6 @@ continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) - continue; dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); @@ -1691,7 +2128,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct ndisc_options ndopts; @@ -1701,7 +2137,7 @@ int optlen, on_link; u8 *lladdr; - optlen = skb->tail - skb->transport_header; + optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { @@ -1752,7 +2188,7 @@ } rt = (struct rt6_info *) dst; - if (rt == net->ipv6.ip6_null_entry) { + if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } @@ -1778,7 +2214,7 @@ NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, &msg->dest); + nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) goto out; @@ -1810,42 +2246,36 @@ * Misc support functions */ -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest) +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); - - if (rt) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->dst.flags |= DST_HOST; - - rt->rt6i_dst.addr = *dest; - rt->rt6i_dst.plen = 128; - dst_copy_metrics(&rt->dst, &ort->dst); - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; + BUG_ON(from->dst.from); - if (ort->rt6i_flags & RTF_GATEWAY) - rt->rt6i_gateway = ort->rt6i_gateway; - else - rt->rt6i_gateway = *dest; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = 0; + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->dst.from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES - memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); + rt->rt6i_src = ort->rt6i_src; #endif - memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); - rt->rt6i_table = ort->rt6i_table; - } - return rt; + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -1862,7 +2292,7 @@ return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); if (!fn) goto out; @@ -1887,7 +2317,6 @@ unsigned int pref) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_INFO, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = ifindex, .fc_dst_len = prefixlen, @@ -1898,6 +2327,7 @@ .fc_nlinfo.nl_net = net, }; + cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -1921,7 +2351,7 @@ return NULL; read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -1938,7 +2368,7 @@ unsigned int pref) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_DFLT, + .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | @@ -1985,7 +2415,8 @@ { memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = RT6_TABLE_MAIN; + cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? + : RT6_TABLE_MAIN; cfg->fc_ifindex = rtmsg->rtmsg_ifindex; cfg->fc_metric = rtmsg->rtmsg_metric; cfg->fc_expires = rtmsg->rtmsg_info; @@ -2006,7 +2437,7 @@ struct in6_rtmsg rtmsg; int err; - switch(cmd) { + switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2069,7 +2500,7 @@ return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2080,12 +2511,23 @@ return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } +static int ip6_pkt_policy_failed(struct sk_buff *skb) +{ + return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_INNOROUTES); +} + +static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + skb->dev = skb_dst(skb)->dev; + return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_OUTNOROUTES); +} + /* * Allocate a dst for local (unicast / anycast) address. */ @@ -2094,9 +2536,10 @@ const struct in6_addr *addr, bool anycast) { + u32 tb_id; struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT, NULL); + DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); @@ -2116,7 +2559,9 @@ rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; - rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; + rt->rt6i_table = fib6_get_table(net, tb_id); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); @@ -2129,9 +2574,10 @@ unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; int err = 0; - if (rt->rt6i_prefsrc.plen) + if (rt && rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, @@ -2169,7 +2615,28 @@ .net = net, .addr = &ifp->addr, }; - fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni); + fib6_clean_all(net, fib6_remove_prefsrc, &adni); +} + +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +/* Remove routers and update dst entries when gateway turn into host. */ +static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +{ + struct in6_addr *gateway = (struct in6_addr *)arg; + + if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || + ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + return -1; + } + return 0; +} + +void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) +{ + fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_dev_net { @@ -2196,8 +2663,10 @@ .net = net, }; - fib6_clean_all(net, fib6_ifdown, 0, &adn); + fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); + if (dev) + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -2235,11 +2704,20 @@ PMTU discouvery. */ if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU) && - (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6))) { - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->rt6i_flags & RTF_CACHE) { + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) + rt->rt6i_pmtu = arg->mtu; + } else if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } } return 0; } @@ -2251,7 +2729,7 @@ .mtu = mtu, }; - fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { @@ -2261,6 +2739,9 @@ [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2268,6 +2749,7 @@ { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + unsigned int pref; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); @@ -2288,18 +2770,22 @@ if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || rtm->rtm_type == RTN_PROHIBIT || - rtm->rtm_type == RTN_THROW) + rtm->rtm_type == RTN_THROW || + rtm->rtm_type == RTN_POLICY_FAILED) cfg->fc_flags |= RTF_REJECT; if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; + if (rtm->rtm_flags & RTM_F_CLONED) + cfg->fc_flags |= RTF_CACHE; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); if (tb[RTA_GATEWAY]) { - nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); + cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } @@ -2322,7 +2808,7 @@ } if (tb[RTA_PREFSRC]) - nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); @@ -2343,24 +2829,97 @@ cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); } + if (tb[RTA_PREF]) { + pref = nla_get_u8(tb[RTA_PREF]); + if (pref != ICMPV6_ROUTER_PREF_LOW && + pref != ICMPV6_ROUTER_PREF_HIGH) + pref = ICMPV6_ROUTER_PREF_MEDIUM; + cfg->fc_flags |= RTF_PREF(pref); + } + + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); -beginning: - rtnh = (struct rtnexthop *)cfg->fc_mp; remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2372,38 +2931,115 @@ nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + rt = ip6_route_info_create(&r_cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto cleanup; + } + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; + } + /* Because each route is added like a single route we remove - * this flag after the first nexthop (if there is a collision, - * we have already fail to add the first nexthop: - * fib6_add_rt2node() has reject it). + * these flags after the first nexthop: if there is a collision, + * we have already failed to add the first nexthop: + * fib6_add_rt2node() has rejected it; when replacing, old + * nexthops have been replaced by first new, the rest should + * be added to it. */ - cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | + NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } return last_err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2413,12 +3049,12 @@ return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2428,12 +3064,12 @@ return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2445,7 +3081,10 @@ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ - + nla_total_size(sizeof(struct rta_cacheinfo)); + + nla_total_size(sizeof(struct rta_cacheinfo)) + + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->dst.lwtstate); } static int rt6_fill_node(struct net *net, @@ -2454,6 +3093,7 @@ int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { + u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; @@ -2490,6 +3130,9 @@ case -EACCES: rtm->rtm_type = RTN_PROHIBIT; break; + case -EPERM: + rtm->rtm_type = RTN_POLICY_FAILED; + break; case -EAGAIN: rtm->rtm_type = RTN_THROW; break; @@ -2505,6 +3148,11 @@ else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; + if (!netif_carrier_ok(rt->dst.dev)) { + rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) @@ -2520,19 +3168,19 @@ rtm->rtm_flags |= RTM_F_CLONED; if (dst) { - if (nla_put(skb, RTA_DST, 16, dst)) + if (nla_put_in6_addr(skb, RTA_DST, dst)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { - if (nla_put(skb, RTA_SRC, 16, src)) + if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) goto nla_put_failure; #endif if (iif) { @@ -2558,22 +3206,25 @@ } else if (dst) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && - nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->rt6i_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->rt6i_prefsrc.addr; - if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt6i_pmtu) + metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0) + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) goto nla_put_failure; } @@ -2588,7 +3239,14 @@ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + goto nla_put_failure; + + if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2612,7 +3270,7 @@ prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; @@ -2649,6 +3307,9 @@ if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); + if (tb[RTA_MARK]) + fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (iif) { struct net_device *dev; int flags = 0; @@ -2669,6 +3330,11 @@ } else { fl6.flowi6_oif = oif; + if (netif_index_is_l3_master(net, oif)) { + fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | + FLOWI_FLAG_SKIP_NH_OIF; + } + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } @@ -2700,7 +3366,8 @@ return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -2710,12 +3377,12 @@ err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2731,9 +3398,9 @@ } static int ip6_route_dev_notify(struct notifier_block *this, - unsigned long event, void *data) + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { @@ -2742,6 +3409,8 @@ #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.ip6_policy_failed_entry->dst.dev = dev; + net->ipv6.ip6_policy_failed_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif @@ -2750,62 +3419,30 @@ return NOTIFY_OK; } -/* - * /proc - */ - -#ifdef CONFIG_PROC_FS - -struct rt6_proc_arg +int rt6_register_notifier(struct notifier_block *nb) { - char *buffer; - int offset; - int length; - int skip; - int len; -}; - -static int rt6_info_route(struct rt6_info *rt, void *p_arg) -{ - struct seq_file *m = p_arg; - - seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); - -#ifdef CONFIG_IPV6_SUBTREES - seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); -#else - seq_puts(m, "00000000000000000000000000000000 00 "); -#endif - if (rt->rt6i_flags & RTF_GATEWAY) { - seq_printf(m, "%pi6", &rt->rt6i_gateway); - } else { - seq_puts(m, "00000000000000000000000000000000"); - } - seq_printf(m, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); - return 0; + return atomic_notifier_chain_register(&ip6route_chain, nb); } +EXPORT_SYMBOL(rt6_register_notifier); -static int ipv6_route_show(struct seq_file *m, void *v) +int rt6_unregister_notifier(struct notifier_block *nb) { - struct net *net = (struct net *)m->private; - fib6_clean_all_ro(net, rt6_info_route, 0, m); - return 0; + return atomic_notifier_chain_unregister(&ip6route_chain, nb); } +EXPORT_SYMBOL(rt6_unregister_notifier); -static int ipv6_route_open(struct inode *inode, struct file *file) -{ - return single_open_net(inode, file, ipv6_route_show); -} +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS static const struct file_operations ipv6_route_proc_fops = { .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release_net, + .release = seq_release_net, }; static int rt6_stats_seq_show(struct seq_file *seq, void *v) @@ -2840,7 +3477,7 @@ #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -2855,7 +3492,7 @@ return 0; } -ctl_table ipv6_route_table_template[] = { +struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, @@ -3002,6 +3639,17 @@ net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + + net->ipv6.ip6_policy_failed_entry = + kmemdup(&ip6_policy_failed_entry_template, + sizeof(*net->ipv6.ip6_policy_failed_entry), GFP_KERNEL); + if (!net->ipv6.ip6_policy_failed_entry) + goto out_ip6_blk_hole_entry; + net->ipv6.ip6_policy_failed_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_policy_failed_entry; + net->ipv6.ip6_policy_failed_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_policy_failed_entry->dst, + ip6_template_metrics, true); #endif net->ipv6.sysctl.flush_delay = 0; @@ -3020,6 +3668,8 @@ return ret; #ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_ip6_blk_hole_entry: + kfree(net->ipv6.ip6_blk_hole_entry); out_ip6_prohibit_entry: kfree(net->ipv6.ip6_prohibit_entry); out_ip6_null_entry: @@ -3037,6 +3687,7 @@ #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); + kfree(net->ipv6.ip6_policy_failed_entry); #endif dst_entries_destroy(&net->ipv6.ip6_dst_ops); } @@ -3101,6 +3752,7 @@ int __init ip6_route_init(void) { int ret; + int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = @@ -3133,6 +3785,9 @@ init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_policy_failed_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_policy_failed_entry->rt6i_idev = + in6_dev_get(init_net.loopback_dev); #endif ret = fib6_init(); if (ret) @@ -3160,6 +3815,13 @@ if (ret) goto out_register_late_subsys; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + + INIT_LIST_HEAD(&ul->head); + spin_lock_init(&ul->lock); + } + out: return ret;