#if defined(CONFIG_BCM_KF_MPTCP) && defined(CONFIG_BCM_MPTCP)
// SPDX-License-Identifier: GPL-2.0
/* MPTCP Scheduler to reduce HoL-blocking and spurious retransmissions.
 *
 * Algorithm Design:
 * Simone Ferlin
 * Ozgu Alay
 * Olivier Mehani
 * Roksana Boreli
 *
 * Initial Implementation:
 * Simone Ferlin
 *
 * Additional Authors:
 * Daniel Weber
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <net/mptcp.h>
#include <trace/events/tcp.h>	/* assumed header for trace_mptcp_retransmit() */

static unsigned char lambda __read_mostly = 12;
module_param(lambda, byte, 0644);
MODULE_PARM_DESC(lambda, "Divided by 10 for scaling factor of fast flow rate estimation");

static unsigned char max_lambda __read_mostly = 13;
module_param(max_lambda, byte, 0644);
MODULE_PARM_DESC(max_lambda, "Divided by 10 for maximum scaling factor of fast flow rate estimation");

static unsigned char min_lambda __read_mostly = 10;
module_param(min_lambda, byte, 0644);
MODULE_PARM_DESC(min_lambda, "Divided by 10 for minimum scaling factor of fast flow rate estimation");

static unsigned char dyn_lambda_good = 10; /* 1% */
module_param(dyn_lambda_good, byte, 0644);
MODULE_PARM_DESC(dyn_lambda_good, "Decrease of lambda in positive case.");

static unsigned char dyn_lambda_bad = 40; /* 4% */
module_param(dyn_lambda_bad, byte, 0644);
MODULE_PARM_DESC(dyn_lambda_bad, "Increase of lambda in negative case.");

struct blestsched_priv {
	u32 last_rbuf_opti;
	u32 min_srtt_us;
	u32 max_srtt_us;
};

struct blestsched_cb {
	bool retrans_flag;
	s16 lambda_1000; /* values range from min_lambda * 100 to max_lambda * 100 */
	u32 last_lambda_update;
};

static struct blestsched_priv *blestsched_get_priv(const struct tcp_sock *tp)
{
	return (struct blestsched_priv *)&tp->mptcp->mptcp_sched[0];
}

static struct blestsched_cb *blestsched_get_cb(const struct tcp_sock *tp)
{
	return (struct blestsched_cb *)&tp->mpcb->mptcp_sched[0];
}

static void blestsched_update_lambda(struct sock *meta_sk, struct sock *sk)
{
	struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(meta_sk));
	struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));

	if (tcp_jiffies32 - blest_cb->last_lambda_update < usecs_to_jiffies(blest_p->min_srtt_us >> 3))
		return;

	/* if there have been retransmissions of packets of the slow flow
	 * during the slow flow's last RTT => increase lambda
	 * otherwise decrease
	 */
	if (blest_cb->retrans_flag) {
		/* need to slow down on the slow flow */
		blest_cb->lambda_1000 += dyn_lambda_bad;
	} else {
		/* use the slow flow more */
		blest_cb->lambda_1000 -= dyn_lambda_good;
	}
	blest_cb->retrans_flag = false;

	/* cap lambda_1000 to its value range */
	blest_cb->lambda_1000 = min_t(s16, blest_cb->lambda_1000, max_lambda * 100);
	blest_cb->lambda_1000 = max_t(s16, blest_cb->lambda_1000, min_lambda * 100);

	blest_cb->last_lambda_update = tcp_jiffies32;
}
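/* Worked example for the adaptation above (illustrative only, derived from the
 * default module parameters): lambda_1000 starts at lambda * 100 = 1200 (see
 * blestsched_init() below) and is updated at most once per smoothed RTT of the
 * slow flow.  A window with a retransmission on the slow flow adds
 * dyn_lambda_bad = 40 (+4% of the base 1000), a clean window subtracts
 * dyn_lambda_good = 10 (-1%), and the result is clamped to
 * [min_lambda * 100, max_lambda * 100] = [1000, 1300].  Roughly four clean
 * RTTs are therefore needed to undo one bad one.
 */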
/* how many bytes will sk send during the rtt of another, slower flow? */
static u32 blestsched_estimate_bytes(struct sock *sk, u32 time_8)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct blestsched_priv *blest_p = blestsched_get_priv(tp);
	struct blestsched_cb *blest_cb = blestsched_get_cb(mptcp_meta_tp(tp));
	u32 avg_rtt, num_rtts, ca_cwnd, packets;

	avg_rtt = (blest_p->min_srtt_us + blest_p->max_srtt_us) / 2;
	if (avg_rtt == 0)
		num_rtts = 1; /* sanity */
	else
		num_rtts = (time_8 / avg_rtt) + 1; /* round up */

	/* during num_rtts, how many bytes will be sent on the flow?
	 * assumes for simplification that Reno is applied as congestion-control
	 */
	if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
		/* we are in initial slow start */
		if (num_rtts > 16)
			num_rtts = 16; /* cap for sanity */
		packets = tp->snd_cwnd * ((1 << num_rtts) - 1); /* cwnd + 2*cwnd + 4*cwnd */
	} else {
		ca_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh + 1); /* assume we jump to CA already */
		packets = (ca_cwnd + (num_rtts - 1) / 2) * num_rtts;
	}

	return div_u64(((u64)packets) * tp->mss_cache * blest_cb->lambda_1000, 1000);
}
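/* Sketch of the arithmetic behind blestsched_estimate_bytes() (illustration,
 * assuming Reno as stated above): in initial slow start the cwnd roughly
 * doubles every RTT, so over num_rtts round trips about
 *	cwnd * (1 + 2 + 4 + ... + 2^(num_rtts - 1)) = cwnd * (2^num_rtts - 1)
 * packets are sent, which is the first branch.  In congestion avoidance the
 * cwnd grows by about one packet per RTT, so the sum
 *	ca_cwnd + (ca_cwnd + 1) + ... + (ca_cwnd + num_rtts - 1)
 *		= (ca_cwnd + (num_rtts - 1) / 2) * num_rtts
 * matches the second branch (integer division is an approximation).  The
 * packet count is converted to bytes via mss_cache and scaled by
 * lambda_1000 / 1000, the fast-flow rate scaling factor described by the
 * module parameters.
 */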
static u32 blestsched_estimate_linger_time(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct blestsched_priv *blest_p = blestsched_get_priv(tp);
	u32 estimate, slope, inflight, cwnd;

	inflight = tcp_packets_in_flight(tp) + 1; /* take into account the new one */
	cwnd = tp->snd_cwnd;

	if (inflight >= cwnd) {
		estimate = blest_p->max_srtt_us;
	} else {
		slope = blest_p->max_srtt_us - blest_p->min_srtt_us;
		if (cwnd == 0)
			cwnd = 1; /* sanity */
		estimate = blest_p->min_srtt_us + (slope * inflight) / cwnd;
	}

	return (tp->srtt_us > estimate) ? tp->srtt_us : estimate;
}

/* This is the BLEST scheduler. This function decides on which flow to send
 * a given MSS. If all subflows are found to be busy or the currently best
 * subflow is estimated to possibly cause HoL-blocking, NULL is returned.
 */
struct sock *blest_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
					 bool zero_wnd_test)
{
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
	struct sock *bestsk, *minsk = NULL;
	struct tcp_sock *meta_tp, *besttp;
	struct mptcp_tcp_sock *mptcp;
	struct blestsched_priv *blest_p;
	u32 min_srtt = U32_MAX;

	/* Answer data_fin on same subflow!!! */
	if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
	    skb && mptcp_is_data_fin(skb)) {
		mptcp_for_each_sub(mpcb, mptcp) {
			bestsk = mptcp_to_sock(mptcp);

			if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index &&
			    mptcp_is_available(bestsk, skb, zero_wnd_test))
				return bestsk;
		}
	}

	/* First, find the overall best subflow */
	mptcp_for_each_sub(mpcb, mptcp) {
		bestsk = mptcp_to_sock(mptcp);
		besttp = tcp_sk(bestsk);
		blest_p = blestsched_get_priv(besttp);

		/* Set of states for which we are allowed to send data */
		if (!mptcp_sk_can_send(bestsk))
			continue;

		/* We do not send data on this subflow unless it is
		 * fully established, i.e. the 4th ack has been received.
		 */
		if (besttp->mptcp->pre_established)
			continue;

		blest_p->min_srtt_us = min(blest_p->min_srtt_us, besttp->srtt_us);
		blest_p->max_srtt_us = max(blest_p->max_srtt_us, besttp->srtt_us);

		/* record minimal rtt */
		if (besttp->srtt_us < min_srtt) {
			min_srtt = besttp->srtt_us;
			minsk = bestsk;
		}
	}

	/* find the current best subflow according to the default scheduler */
	bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test);

	/* if we decided to use a slower flow, we have the option of not using it at all */
	if (bestsk && minsk && bestsk != minsk) {
		u32 slow_linger_time, fast_bytes, slow_inflight_bytes, slow_bytes, avail_space;
		u32 buffered_bytes = 0;

		meta_tp = tcp_sk(meta_sk);
		besttp = tcp_sk(bestsk);

		blestsched_update_lambda(meta_sk, bestsk);

		/* if we send this SKB now, it will be acked in besttp->srtt seconds
		 * during this time: how many bytes will we send on the fast flow?
		 */
		slow_linger_time = blestsched_estimate_linger_time(bestsk);
		fast_bytes = blestsched_estimate_bytes(minsk, slow_linger_time);

		if (skb)
			buffered_bytes = skb->len;

		/* is the required space available in the mptcp meta send window?
		 * we assume that all bytes inflight on the slow path will be acked in
		 * besttp->srtt seconds (just like the SKB if it was sent now)
		 * -> that means that those inflight bytes will keep occupying space
		 * in the meta window until then
		 */
		slow_inflight_bytes = besttp->write_seq - besttp->snd_una;
		slow_bytes = buffered_bytes + slow_inflight_bytes; // bytes of this SKB plus those in flight already

		avail_space = (slow_bytes < meta_tp->snd_wnd) ?
			      (meta_tp->snd_wnd - slow_bytes) : 0;

		if (fast_bytes > avail_space) {
			/* sending this SKB on the slow flow means
			 * we wouldn't be able to send all the data we'd like
			 * to send on the fast flow, so don't do that
			 */
			return NULL;
		}
	}

	return bestsk;
}
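/* Numeric illustration of the decision in blest_get_available_subflow()
 * (hypothetical values): assume meta_tp->snd_wnd is 64000 bytes, the slow
 * subflow has 40000 bytes in flight and the SKB holds 1400 bytes, so
 * slow_bytes = 41400 and avail_space = 22600.  If the fast subflow is
 * estimated to send 30000 bytes (already scaled by lambda) while that SKB
 * lingers on the slow path, fast_bytes > avail_space and NULL is returned:
 * sending on the slow subflow now could leave the fast subflow blocked on
 * meta send-window space, which is the HoL-blocking BLEST tries to avoid.
 */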
/* copy from mptcp_sched.c: mptcp_rcv_buf_optimization */
static struct sk_buff *mptcp_blest_rcv_buf_optimization(struct sock *sk, int penal)
{
	struct sock *meta_sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	struct mptcp_tcp_sock *mptcp;
	struct sk_buff *skb_head;
	struct blestsched_priv *blest_p = blestsched_get_priv(tp);
	struct blestsched_cb *blest_cb;

	meta_sk = mptcp_meta_sk(sk);
	skb_head = tcp_rtx_queue_head(meta_sk);

	if (!skb_head)
		return NULL;

	/* If penalization is optional (coming from mptcp_next_segment()) and
	 * we are not send-buffer-limited, we do not penalize. The retransmission
	 * is just an optimization to fix the idle-time due to the delay before
	 * we wake up the application.
	 */
	if (!penal && sk_stream_memory_free(meta_sk))
		goto retrans;

	/* Record the occurrence of a retransmission to update the lambda value */
	blest_cb = blestsched_get_cb(tcp_sk(meta_sk));
	blest_cb->retrans_flag = true;

	/* Only penalize again after an RTT has elapsed */
	if (tcp_jiffies32 - blest_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
		goto retrans;

	/* Half the cwnd of the slow flows */
	mptcp_for_each_sub(tp->mpcb, mptcp) {
		struct tcp_sock *tp_it = mptcp->tp;

		if (tp_it != tp &&
		    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
			if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
				u32 prior_cwnd = tp_it->snd_cwnd;

				tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);

				/* If in slow start, do not reduce the ssthresh */
				if (prior_cwnd >= tp_it->snd_ssthresh)
					tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);

				blest_p->last_rbuf_opti = tcp_jiffies32;
			}
		}
	}

retrans:

	/* Segment not yet injected into this path? Take it!!! */
	if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
		bool do_retrans = false;

		mptcp_for_each_sub(tp->mpcb, mptcp) {
			struct tcp_sock *tp_it = mptcp->tp;

			if (tp_it != tp &&
			    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
				if (tp_it->snd_cwnd <= 4) {
					do_retrans = true;
					break;
				}

				if (4 * tp->srtt_us >= tp_it->srtt_us) {
					do_retrans = false;
					break;
				} else {
					do_retrans = true;
				}
			}
		}

		if (do_retrans && mptcp_is_available(sk, skb_head, false)) {
			trace_mptcp_retransmit(sk, skb_head);
			return skb_head;
		}
	}
	return NULL;
}
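/* Reading of the do_retrans heuristic above (illustration, hypothetical
 * numbers): the head of the meta retransmit queue is re-injected on this
 * subflow only if some other subflow already carrying it looks problematic,
 * i.e. its cwnd has collapsed to 4 packets or fewer, or its srtt is more than
 * four times ours.  With a single other subflow carrying the segment,
 * tp->srtt_us = 20000 and tp_it->srtt_us = 100000, we have
 * 4 * 20000 < 100000, so do_retrans becomes true and the segment is
 * retransmitted here, provided mptcp_is_available() agrees.
 */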
/* copy from mptcp_sched.c: __mptcp_next_segment */
/* Returns the next segment to be sent from the mptcp meta-queue.
 * (chooses the reinject queue if any segment is waiting in it, otherwise,
 * chooses the normal write queue).
 * Sets *@reinject to 1 if the returned segment comes from the
 * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
 * and sets it to -1 if it is a meta-level retransmission to optimize the
 * receive-buffer.
 */
static struct sk_buff *__mptcp_blest_next_segment(struct sock *meta_sk, int *reinject)
{
	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
	struct sk_buff *skb = NULL;

	*reinject = 0;

	/* If we are in fallback-mode, just take from the meta-send-queue */
	if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
		return tcp_send_head(meta_sk);

	skb = skb_peek(&mpcb->reinject_queue);

	if (skb) {
		*reinject = 1;
	} else {
		skb = tcp_send_head(meta_sk);

		if (!skb && meta_sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
		    sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
			struct sock *subsk = blest_get_available_subflow(meta_sk, NULL, false);

			if (!subsk)
				return NULL;

			skb = mptcp_blest_rcv_buf_optimization(subsk, 0);
			if (skb)
				*reinject = -1;
		}
	}
	return skb;
}

/* copy from mptcp_sched.c: mptcp_next_segment */
static struct sk_buff *mptcp_blest_next_segment(struct sock *meta_sk,
						int *reinject,
						struct sock **subsk,
						unsigned int *limit)
{
	struct sk_buff *skb = __mptcp_blest_next_segment(meta_sk, reinject);
	unsigned int mss_now;
	struct tcp_sock *subtp;
	u16 gso_max_segs;
	u32 max_len, max_segs, window, needed;

	/* As we set it, we have to reset it as well. */
	*limit = 0;

	if (!skb)
		return NULL;

	*subsk = blest_get_available_subflow(meta_sk, skb, false);
	if (!*subsk)
		return NULL;

	subtp = tcp_sk(*subsk);
	mss_now = tcp_current_mss(*subsk);

	if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
		skb = mptcp_blest_rcv_buf_optimization(*subsk, 1);
		if (skb)
			*reinject = -1;
		else
			return NULL;
	}

	/* No splitting required, as we will only send one single segment */
	if (skb->len <= mss_now)
		return skb;

	/* The following is similar to tcp_mss_split_point, but
	 * we do not care about nagle, because we will anyways
	 * use TCP_NAGLE_PUSH, which overrides this.
	 *
	 * So, we first limit according to the cwnd/gso-size and then according
	 * to the subflow's window.
	 */

	gso_max_segs = (*subsk)->sk_gso_max_segs;
	if (!gso_max_segs) /* No gso supported on the subflow's NIC */
		gso_max_segs = 1;

	max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
	if (!max_segs)
		return NULL;

	max_len = mss_now * max_segs;
	window = tcp_wnd_end(subtp) - subtp->write_seq;

	needed = min(skb->len, window);
	if (max_len <= skb->len)
		/* Take max_win, which is actually the cwnd/gso-size */
		*limit = max_len;
	else
		/* Or, take the window */
		*limit = needed;

	return skb;
}

static void blestsched_init(struct sock *sk)
{
	struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));
	struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(mptcp_meta_sk(sk)));

	blest_p->last_rbuf_opti = tcp_jiffies32;
	blest_p->min_srtt_us = U32_MAX;
	blest_p->max_srtt_us = 0;

	if (!blest_cb->lambda_1000) {
		blest_cb->lambda_1000 = lambda * 100;
		blest_cb->last_lambda_update = tcp_jiffies32;
	}
}

static struct mptcp_sched_ops mptcp_sched_blest = {
	.get_subflow = blest_get_available_subflow,
	.next_segment = mptcp_blest_next_segment,
	.init = blestsched_init,
	.name = "blest",
	.owner = THIS_MODULE,
};

static int __init blest_register(void)
{
	BUILD_BUG_ON(sizeof(struct blestsched_priv) > MPTCP_SCHED_SIZE);
	BUILD_BUG_ON(sizeof(struct blestsched_cb) > MPTCP_SCHED_DATA_SIZE);

	if (mptcp_register_scheduler(&mptcp_sched_blest))
		return -1;

	return 0;
}

static void blest_unregister(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_blest);
}

module_init(blest_register);
module_exit(blest_unregister);

MODULE_AUTHOR("Simone Ferlin, Daniel Weber");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("BLEST scheduler for MPTCP, based on default minimum RTT scheduler");
MODULE_VERSION("0.95");
#endif
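/* Usage sketch (assumes the out-of-tree MPTCP stack's usual knobs; names are
 * not taken from this file): once the module is loaded, the scheduler is
 * selected with "sysctl -w net.mptcp.mptcp_scheduler=blest", and the lambda
 * parameters above can be tuned at runtime through
 * /sys/module/<module name>/parameters/ thanks to their 0644 permissions.
 */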