#if defined(CONFIG_BCM_MPTCP) && defined(CONFIG_BCM_KF_MPTCP)
/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */

#include <linux/module.h>
#include <net/mptcp.h>

static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);

struct defsched_priv {
	u32	last_rbuf_opti;
};

static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
{
	return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
}

bool mptcp_is_def_unavailable(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Set of states for which we are allowed to send data */
	if (!mptcp_sk_can_send(sk))
		return true;

	/* We do not send data on this subflow unless it is
	 * fully established, i.e. the 4th ack has been received.
	 */
	if (tp->mptcp->pre_established)
		return true;

	if (tp->pf)
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);

static bool mptcp_is_temp_unavailable(struct sock *sk,
				      const struct sk_buff *skb,
				      bool zero_wnd_test)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int mss_now, space, in_flight;

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
		/* If SACK is disabled, and we got a loss, TCP does not exit
		 * the loss-state until something above high_seq has been
		 * acked. (see tcp_try_undo_recovery)
		 *
		 * high_seq is the snd_nxt at the moment of the RTO. As soon
		 * as we have an RTO, we won't push data on the subflow.
		 * Thus, snd_una can never go beyond high_seq.
		 */
		if (!tcp_is_reno(tp))
			return true;
		else if (tp->snd_una != tp->high_seq)
			return true;
	}

	if (!tp->mptcp->fully_established) {
		/* Make sure that we send in-order data */
		if (skb && tp->mptcp->second_packet &&
		    tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
			return true;
	}

	/* If TSQ is already throttling us, do not send on this subflow. When
	 * TSQ gets cleared the subflow becomes eligible again.
	 */
	if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
		return true;

	in_flight = tcp_packets_in_flight(tp);
	/* Not even a single spot in the cwnd */
	if (in_flight >= tp->snd_cwnd)
		return true;

	/* Now, check if what is queued in the subflow's send-queue
	 * already fills the cwnd.
	 */
	space = (tp->snd_cwnd - in_flight) * tp->mss_cache;

	if (tp->write_seq - tp->snd_nxt > space)
		return true;

	if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
		return true;

	mss_now = tcp_current_mss(sk);

	/* Don't send on this subflow if we bypass the allowed send-window at
	 * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
	 * calculated end_seq (because here at this point end_seq is still at
	 * the meta-level).
	 */
	if (skb && !zero_wnd_test &&
	    after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
		return true;

	return false;
}

/* Is the sub-socket sk available to send the skb? */
bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
			bool zero_wnd_test)
{
	return !mptcp_is_def_unavailable(sk) &&
	       !mptcp_is_temp_unavailable(sk, skb, zero_wnd_test);
}
EXPORT_SYMBOL_GPL(mptcp_is_available);
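
/* Illustrative sketch (not part of the default scheduler): a custom
 * scheduler's get_subflow hook can build directly on the two checks
 * above. The hypothetical helper below returns the first subflow of
 * the connection that passes mptcp_is_available(), ignoring RTT and
 * backup priorities.
 */
static inline struct sock *example_first_available_subflow(struct mptcp_cb *mpcb,
							   struct sk_buff *skb)
{
	struct sock *sk;

	mptcp_for_each_sk(mpcb, sk) {
		/* Skip subflows that are definitively or temporarily unusable */
		if (mptcp_is_available(sk, skb, false))
			return sk;
	}
	return NULL;
}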

/* Are we not allowed to reinject this skb on tp? */
static int mptcp_dont_reinject_skb(const struct tcp_sock *tp,
				   const struct sk_buff *skb)
{
	/* If the skb has already been enqueued in this sk, try to find
	 * another one.
	 */
	return skb &&
		/* Has the skb already been enqueued into this subsocket? */
		mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
}

bool subflow_is_backup(const struct tcp_sock *tp)
{
	return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio;
}
EXPORT_SYMBOL_GPL(subflow_is_backup);

bool subflow_is_active(const struct tcp_sock *tp)
{
	return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio;
}
EXPORT_SYMBOL_GPL(subflow_is_active);

/* Generic function to iterate over used and unused subflows and to select the
 * best one
 */
static struct sock *get_subflow_from_selectors(struct mptcp_cb *mpcb,
					       struct sk_buff *skb,
					       bool (*selector)(const struct tcp_sock *),
					       bool zero_wnd_test, bool *force)
{
	struct sock *bestsk = NULL;
	u32 min_srtt = 0xffffffff;
	bool found_unused = false;
	bool found_unused_una = false;
	struct sock *sk;

	mptcp_for_each_sk(mpcb, sk) {
		struct tcp_sock *tp = tcp_sk(sk);
		bool unused = false;

		/* First, we choose only the wanted sks */
		if (!(*selector)(tp))
			continue;

		if (!mptcp_dont_reinject_skb(tp, skb))
			unused = true;
		else if (found_unused)
			/* If an unused sk was found previously, we continue -
			 * no need to check used sks anymore.
			 */
			continue;

		if (mptcp_is_def_unavailable(sk))
			continue;

		if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) {
			if (unused)
				found_unused_una = true;
			continue;
		}

		if (unused) {
			if (!found_unused) {
				/* It's the first time we encounter an unused
				 * sk - thus we reset the bestsk (which might
				 * have been set to a used sk).
				 */
				min_srtt = 0xffffffff;
				bestsk = NULL;
			}
			found_unused = true;
		}

		if (tp->srtt_us < min_srtt) {
			min_srtt = tp->srtt_us;
			bestsk = sk;
		}
	}

	if (bestsk) {
		/* The force variable is used to mark the returned sk as
		 * previously used or not-used.
		 */
		if (found_unused)
			*force = true;
		else
			*force = false;
	} else {
		/* The force variable is used to mark if there are temporarily
		 * unavailable not-used sks.
		 */
		if (found_unused_una)
			*force = true;
		else
			*force = false;
	}
	return bestsk;
}
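
/* Illustrative sketch (hypothetical, not part of the default scheduler):
 * any predicate on a tcp_sock can act as the selector argument of
 * get_subflow_from_selectors() above, just like subflow_is_active() and
 * subflow_is_backup(). For example, a scheduler could restrict a pass to
 * subflows whose smoothed RTT stays below a fixed bound; the 50 ms
 * threshold here is made up for illustration.
 */
static inline bool example_subflow_is_fast(const struct tcp_sock *tp)
{
	/* srtt_us carries a << 3 scaling, as in the srtt checks elsewhere
	 * in this file, so shift it out before comparing against usecs.
	 */
	return (tp->srtt_us >> 3) < 50000U;	/* 50 ms in microseconds */
}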

/* This is the scheduler. This function decides on which flow to send
 * a given MSS. If all subflows are found to be busy, NULL is returned.
 * The flow is selected based on the shortest RTT.
 * If all paths have full congestion windows, we simply return NULL.
 *
 * Additionally, this function is aware of the backup-subflows.
 */
struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
				   bool zero_wnd_test)
{
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
	struct sock *sk;
	bool force;

	/* If there is only one subflow, bypass the scheduling function */
	if (mpcb->cnt_subflows == 1) {
		sk = (struct sock *)mpcb->connection_list;
		if (!mptcp_is_available(sk, skb, zero_wnd_test))
			sk = NULL;
		return sk;
	}

	/* Answer data_fin on the same subflow!!! */
	if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
	    skb && mptcp_is_data_fin(skb)) {
		mptcp_for_each_sk(mpcb, sk) {
			if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
			    mptcp_is_available(sk, skb, zero_wnd_test))
				return sk;
		}
	}

	/* Find the best subflow */
	sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active,
					zero_wnd_test, &force);
	if (force)
		/* one unused active sk or a NULL sk when there is at least
		 * one temporarily unavailable unused active sk
		 */
		return sk;

	sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup,
					zero_wnd_test, &force);
	if (!force && skb)
		/* one used backup sk or a NULL sk when there is no
		 * temporarily unavailable unused backup sk
		 *
		 * the skb has passed through all the available active and
		 * backup sks, so clear the path mask
		 */
		TCP_SKB_CB(skb)->path_mask = 0;
	return sk;
}
EXPORT_SYMBOL_GPL(get_available_subflow);

static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
{
	struct sock *meta_sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sock *tp_it;
	struct sk_buff *skb_head;
	struct defsched_priv *dsp = defsched_get_priv(tp);

	if (tp->mpcb->cnt_subflows == 1)
		return NULL;

	meta_sk = mptcp_meta_sk(sk);
	skb_head = tcp_write_queue_head(meta_sk);

	if (!skb_head || skb_head == tcp_send_head(meta_sk))
		return NULL;

	/* If penalization is optional (coming from mptcp_next_segment()) and
	 * we are not send-buffer-limited, we do not penalize. The
	 * retransmission is just an optimization to fix the idle-time due to
	 * the delay before we wake up the application.
	 */
	if (!penal && sk_stream_memory_free(meta_sk))
		goto retrans;

	/* Only penalize again after an RTT has elapsed */
	if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
		goto retrans;

	/* Halve the cwnd of the slow flow */
	mptcp_for_each_tp(tp->mpcb, tp_it) {
		if (tp_it != tp &&
		    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
			if (tp->srtt_us < tp_it->srtt_us &&
			    inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
				u32 prior_cwnd = tp_it->snd_cwnd;

				tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);

				/* If in slow start, do not reduce the ssthresh */
				if (prior_cwnd >= tp_it->snd_ssthresh)
					tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);

				dsp->last_rbuf_opti = tcp_time_stamp;
			}
			break;
		}
	}

retrans:

	/* Segment not yet injected into this path? Take it!!! */
	if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
		bool do_retrans = false;

		mptcp_for_each_tp(tp->mpcb, tp_it) {
			if (tp_it != tp &&
			    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
				if (tp_it->snd_cwnd <= 4) {
					do_retrans = true;
					break;
				}

				if (4 * tp->srtt_us >= tp_it->srtt_us) {
					do_retrans = false;
					break;
				} else {
					do_retrans = true;
				}
			}
		}

		if (do_retrans && mptcp_is_available(sk, skb_head, false))
			return skb_head;
	}
	return NULL;
}
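
/* Illustrative sketch (hypothetical helper, not used by the code above):
 * the do_retrans decision in mptcp_rcv_buf_optimization() boils down to
 * "the other path that already carries the segment is either nearly
 * stalled (cwnd <= 4) or its RTT is more than four times ours". Written
 * as a standalone predicate purely for clarity:
 */
static inline bool example_worth_reinjecting(const struct tcp_sock *this_tp,
					     const struct tcp_sock *other_tp)
{
	if (other_tp->snd_cwnd <= 4)
		return true;

	/* Both srtt_us values carry the same scaling, so compare directly */
	return 4 * this_tp->srtt_us < other_tp->srtt_us;
}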

/* Returns the next segment to be sent from the mptcp meta-queue.
 * (chooses the reinject queue if any segment is waiting in it, otherwise,
 * chooses the normal write queue).
 * Sets *@reinject to 1 if the returned segment comes from the
 * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
 * and sets it to -1 if it is a meta-level retransmission to optimize the
 * receive-buffer.
 */
static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
{
	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
	struct sk_buff *skb = NULL;

	*reinject = 0;

	/* If we are in fallback-mode, just take from the meta-send-queue */
	if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
		return tcp_send_head(meta_sk);

	skb = skb_peek(&mpcb->reinject_queue);

	if (skb) {
		*reinject = 1;
	} else {
		skb = tcp_send_head(meta_sk);

		if (!skb && meta_sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
		    sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
			struct sock *subsk = get_available_subflow(meta_sk, NULL,
								   false);
			if (!subsk)
				return NULL;

			skb = mptcp_rcv_buf_optimization(subsk, 0);
			if (skb)
				*reinject = -1;
		}
	}
	return skb;
}
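
/* Illustrative sketch (hypothetical helper, mirroring the split logic of
 * mptcp_next_segment() below): given the current MSS, the number of
 * segments allowed by cwnd/gso, the skb length and the remaining send
 * window, compute the per-call limit. For example, mss_now = 1400,
 * max_segs = 3, skb_len = 5000 and window = 4200 give max_len = 4200,
 * which is <= skb_len, so the limit is 4200.
 */
static inline u32 example_next_segment_limit(u32 mss_now, u32 max_segs,
					     u32 skb_len, u32 window)
{
	u32 max_len = mss_now * max_segs;
	u32 needed = min(skb_len, window);

	return max_len <= skb_len ? max_len : needed;
}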

static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
					  int *reinject,
					  struct sock **subsk,
					  unsigned int *limit)
{
	struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
	unsigned int mss_now;
	struct tcp_sock *subtp;
	u16 gso_max_segs;
	u32 max_len, max_segs, window, needed;

	/* As we set it, we have to reset it as well. */
	*limit = 0;

	if (!skb)
		return NULL;

	*subsk = get_available_subflow(meta_sk, skb, false);
	if (!*subsk)
		return NULL;

	subtp = tcp_sk(*subsk);
	mss_now = tcp_current_mss(*subsk);

	if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
		skb = mptcp_rcv_buf_optimization(*subsk, 1);
		if (skb)
			*reinject = -1;
		else
			return NULL;
	}

	/* No splitting required, as we will only send one single segment */
	if (skb->len <= mss_now)
		return skb;

	/* The following is similar to tcp_mss_split_point, but
	 * we do not care about nagle, because we will anyway
	 * use TCP_NAGLE_PUSH, which overrides this.
	 *
	 * So, we first limit according to the cwnd/gso-size and then according
	 * to the subflow's window.
	 */

	gso_max_segs = (*subsk)->sk_gso_max_segs;
	if (!gso_max_segs) /* No gso supported on the subflow's NIC */
		gso_max_segs = 1;
	max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
	if (!max_segs)
		return NULL;

	max_len = mss_now * max_segs;
	window = tcp_wnd_end(subtp) - subtp->write_seq;

	needed = min(skb->len, window);
	if (max_len <= skb->len)
		/* Take max_len, which is actually the cwnd/gso-size limit */
		*limit = max_len;
	else
		/* Or, take the window */
		*limit = needed;

	return skb;
}

static void defsched_init(struct sock *sk)
{
	struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));

	dsp->last_rbuf_opti = tcp_time_stamp;
}

struct mptcp_sched_ops mptcp_sched_default = {
	.get_subflow = get_available_subflow,
	.next_segment = mptcp_next_segment,
	.init = defsched_init,
	.name = "default",
	.owner = THIS_MODULE,
};

static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
{
	struct mptcp_sched_ops *e;

	list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
		if (strcmp(e->name, name) == 0)
			return e;
	}

	return NULL;
}

int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
{
	int ret = 0;

	if (!sched->get_subflow || !sched->next_segment)
		return -EINVAL;

	spin_lock(&mptcp_sched_list_lock);
	if (mptcp_sched_find(sched->name)) {
		pr_notice("%s already registered\n", sched->name);
		ret = -EEXIST;
	} else {
		list_add_tail_rcu(&sched->list, &mptcp_sched_list);
		pr_info("%s registered\n", sched->name);
	}
	spin_unlock(&mptcp_sched_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(mptcp_register_scheduler);

void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
{
	spin_lock(&mptcp_sched_list_lock);
	list_del_rcu(&sched->list);
	spin_unlock(&mptcp_sched_list_lock);
}
EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);

void mptcp_get_default_scheduler(char *name)
{
	struct mptcp_sched_ops *sched;

	BUG_ON(list_empty(&mptcp_sched_list));

	rcu_read_lock();
	sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
	strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
	rcu_read_unlock();
}

int mptcp_set_default_scheduler(const char *name)
{
	struct mptcp_sched_ops *sched;
	int ret = -ENOENT;

	spin_lock(&mptcp_sched_list_lock);
	sched = mptcp_sched_find(name);
#ifdef CONFIG_MODULES
	if (!sched && capable(CAP_NET_ADMIN)) {
		spin_unlock(&mptcp_sched_list_lock);

		request_module("mptcp_%s", name);
		spin_lock(&mptcp_sched_list_lock);
		sched = mptcp_sched_find(name);
	}
#endif

	if (sched) {
		list_move(&sched->list, &mptcp_sched_list);
		ret = 0;
	} else {
		pr_info("%s is not available\n", name);
	}
	spin_unlock(&mptcp_sched_list_lock);

	return ret;
}

void mptcp_init_scheduler(struct mptcp_cb *mpcb)
{
	struct mptcp_sched_ops *sched;

	rcu_read_lock();
	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
		if (try_module_get(sched->owner)) {
			mpcb->sched_ops = sched;
			break;
		}
	}
	rcu_read_unlock();
}

/* Manage refcounts on socket close. */
void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
{
	module_put(mpcb->sched_ops->owner);
}

/* Set default value from kernel configuration at bootup */
static int __init mptcp_scheduler_default(void)
{
	BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);

	return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
}
late_initcall(mptcp_scheduler_default);
#endif