329 lines
11 KiB
Diff
329 lines
11 KiB
Diff
From 232aa8ec3ed979d4716891540c03a806ecab0c37 Mon Sep 17 00:00:00 2001
|
|
From: Priyaranjan Jha <priyarjha@google.com>
|
|
Date: Wed, 23 Jan 2019 12:04:53 -0800
|
|
Subject: tcp_bbr: refactor bbr_target_cwnd() for general inflight provisioning
|
|
|
|
Because bbr_target_cwnd() is really a general-purpose BBR helper for
|
|
computing some volume of inflight data as a function of the estimated
|
|
BDP, refactor it into following helper functions:
|
|
- bbr_bdp()
|
|
- bbr_quantization_budget()
|
|
- bbr_inflight()
|
|
|
|
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
|
|
Signed-off-by: Neal Cardwell <ncardwell@google.com>
|
|
Signed-off-by: Yuchung Cheng <ycheng@google.com>
|
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
---
|
|
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
|
|
--- a/include/net/inet_connection_sock.h
|
|
+++ b/include/net/inet_connection_sock.h
|
|
@@ -141,4 +141,4 @@
|
|
|
|
- u64 icsk_ca_priv[88 / sizeof(u64)];
|
|
-#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64))
|
|
+ u64 icsk_ca_priv[104 / sizeof(u64)];
|
|
+#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64))
|
|
};
|
|
|
|
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
|
|
--- a/net/ipv4/tcp_bbr.c
|
|
+++ b/net/ipv4/tcp_bbr.c
|
|
@@ -117,2 +117,10 @@
|
|
u32 full_bw; /* recent bw, to estimate if pipe is full */
|
|
+
|
|
+ /* For tracking ACK aggregation: */
|
|
+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
|
|
+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
|
|
+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
|
|
+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
|
|
+ extra_acked_win_idx:1, /* current index in extra_acked array */
|
|
+ unused_c:6;
|
|
};
|
|
@@ -176,2 +184,11 @@
|
|
|
|
+/* Gain factor for adding extra_acked to target cwnd: */
|
|
+static const int bbr_extra_acked_gain = BBR_UNIT;
|
|
+/* Window length of extra_acked window. */
|
|
+static const u32 bbr_extra_acked_win_rtts = 5;
|
|
+/* Max allowed value for ack_epoch_acked, after which sampling epoch is reset */
|
|
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
|
|
+/* Time period for clamping cwnd increment due to ack aggregation */
|
|
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
|
|
+
|
|
static void bbr_check_probe_rtt_done(struct sock *sk);
|
|
@@ -202,2 +219,12 @@
|
|
|
|
+/* Return maximum extra acked in past k-2k round trips,
|
|
+ * where k = bbr_extra_acked_win_rtts.
|
|
+ */
|
|
+static u16 bbr_extra_acked(const struct sock *sk)
|
|
+{
|
|
+ struct bbr *bbr = inet_csk_ca(sk);
|
|
+
|
|
+ return max(bbr->extra_acked[0], bbr->extra_acked[1]);
|
|
+}
|
|
+
|
|
/* Return rate in bytes per second, optionally with a gain.
|
|
@@ -307,2 +338,4 @@
|
|
bbr->idle_restart = 1;
|
|
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
|
|
+ bbr->ack_epoch_acked = 0;
|
|
/* Avoid pointless buffer overflows: pace at est. bw if we don't
|
|
@@ -317,6 +350,5 @@
|
|
|
|
-/* Find target cwnd. Right-size the cwnd based on min RTT and the
|
|
- * estimated bottleneck bandwidth:
|
|
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
|
|
*
|
|
- * cwnd = bw * min_rtt * gain = BDP * gain
|
|
+ * bdp = bw * min_rtt * gain
|
|
*
|
|
@@ -326,17 +358,7 @@
|
|
* noise may cause BBR to under-estimate the rate.
|
|
- *
|
|
- * To achieve full performance in high-speed paths, we budget enough cwnd to
|
|
- * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
|
|
- * - one skb in sending host Qdisc,
|
|
- * - one skb in sending host TSO/GSO engine
|
|
- * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
|
|
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
|
|
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
|
|
- * which allows 2 outstanding 2-packet sequences, to try to keep pipe
|
|
- * full even with ACK-every-other-packet delayed ACKs.
|
|
*/
|
|
-static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
|
|
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
|
|
{
|
|
struct bbr *bbr = inet_csk_ca(sk);
|
|
- u32 cwnd;
|
|
+ u32 bdp;
|
|
u64 w;
|
|
@@ -355,3 +377,20 @@
|
|
/* Apply a gain to the given value, then remove the BW_SCALE shift. */
|
|
- cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
|
|
+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
|
|
+
|
|
+ return bdp;
|
|
+}
|
|
+
|
|
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
|
|
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
|
|
+ * - one skb in sending host Qdisc,
|
|
+ * - one skb in sending host TSO/GSO engine
|
|
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
|
|
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
|
|
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
|
|
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
|
|
+ * full even with ACK-every-other-packet delayed ACKs.
|
|
+ */
|
|
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
|
|
+{
|
|
+ struct bbr *bbr = inet_csk_ca(sk);
|
|
|
|
@@ -364,3 +403,3 @@
|
|
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
|
|
- if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
|
|
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
|
|
cwnd += 2;
|
|
@@ -370,2 +409,29 @@
|
|
|
|
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
|
|
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
|
|
+{
|
|
+ u32 inflight;
|
|
+
|
|
+ inflight = bbr_bdp(sk, bw, gain);
|
|
+ inflight = bbr_quantization_budget(sk, inflight);
|
|
+
|
|
+ return inflight;
|
|
+}
|
|
+
|
|
+/* Find the cwnd increment based on estimate of ack aggregation */
|
|
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
|
|
+{
|
|
+ u32 max_aggr_cwnd, aggr_cwnd = 0;
|
|
+
|
|
+ if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
|
|
+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
|
|
+ / BW_UNIT;
|
|
+ aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
|
|
+ >> BBR_SCALE;
|
|
+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
|
|
+ }
|
|
+
|
|
+ return aggr_cwnd;
|
|
+}
|
|
+
|
|
/* An optimization in BBR to reduce losses: On the first round of recovery, we
|
|
@@ -430,4 +496,11 @@
|
|
|
|
+ target_cwnd = bbr_bdp(sk, bw, gain);
|
|
+
|
|
+ /* Increment the cwnd to account for excess ACKed data that seems
|
|
+ * due to aggregation (of data and/or ACKs) visible in the ACK stream.
|
|
+ */
|
|
+ target_cwnd += bbr_ack_aggregation_cwnd(sk);
|
|
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
|
|
+
|
|
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
|
|
- target_cwnd = bbr_target_cwnd(sk, bw, gain);
|
|
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
|
|
@@ -472,3 +545,3 @@
|
|
(rs->losses || /* perhaps pacing_gain*BDP won't fit */
|
|
- inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
|
|
+ inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
|
|
|
|
@@ -479,3 +552,3 @@
|
|
return is_full_length ||
|
|
- inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
|
|
+ inflight <= bbr_inflight(sk, bw, BBR_UNIT);
|
|
}
|
|
@@ -489,4 +562,2 @@
|
|
bbr->cycle_mstamp = tp->delivered_mstamp;
|
|
- bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
|
|
- bbr_pacing_gain[bbr->cycle_idx];
|
|
}
|
|
@@ -508,4 +579,2 @@
|
|
bbr->mode = BBR_STARTUP;
|
|
- bbr->pacing_gain = bbr_high_gain;
|
|
- bbr->cwnd_gain = bbr_high_gain;
|
|
}
|
|
@@ -517,4 +586,2 @@
|
|
bbr->mode = BBR_PROBE_BW;
|
|
- bbr->pacing_gain = BBR_UNIT;
|
|
- bbr->cwnd_gain = bbr_cwnd_gain;
|
|
bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
|
|
@@ -701,2 +768,63 @@
|
|
|
|
+/* Estimates the windowed max degree of ack aggregation.
|
|
+ * This is used to provision extra in-flight data to keep sending during
|
|
+ * inter-ACK silences.
|
|
+ *
|
|
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
|
|
+ *
|
|
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
|
|
+ * cwnd += max_extra_acked
|
|
+ *
|
|
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
|
|
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
|
|
+ * trips.
|
|
+ */
|
|
+static void bbr_update_ack_aggregation(struct sock *sk,
|
|
+ const struct rate_sample *rs)
|
|
+{
|
|
+ u32 epoch_us, expected_acked, extra_acked;
|
|
+ struct bbr *bbr = inet_csk_ca(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
|
|
+ rs->delivered < 0 || rs->interval_us <= 0)
|
|
+ return;
|
|
+
|
|
+ if (bbr->round_start) {
|
|
+ bbr->extra_acked_win_rtts = min(0x1F,
|
|
+ bbr->extra_acked_win_rtts + 1);
|
|
+ if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
|
|
+ bbr->extra_acked_win_rtts = 0;
|
|
+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
|
|
+ 0 : 1;
|
|
+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Compute how many packets we expected to be delivered over epoch. */
|
|
+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
|
|
+ bbr->ack_epoch_mstamp);
|
|
+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
|
|
+
|
|
+ /* Reset the aggregation epoch if ACK rate is below expected rate or
|
|
+ * a significantly large number of ACKs has been received since the
|
|
+ * epoch started (i.e. the epoch is potentially quite old).
|
|
+ */
|
|
+ if (bbr->ack_epoch_acked <= expected_acked ||
|
|
+ (bbr->ack_epoch_acked + rs->acked_sacked >=
|
|
+ bbr_ack_epoch_acked_reset_thresh)) {
|
|
+ bbr->ack_epoch_acked = 0;
|
|
+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
|
|
+ expected_acked = 0;
|
|
+ }
|
|
+
|
|
+ /* Compute excess data delivered, beyond what was expected. */
|
|
+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
|
|
+ bbr->ack_epoch_acked + rs->acked_sacked);
|
|
+ extra_acked = bbr->ack_epoch_acked - expected_acked;
|
|
+ extra_acked = min(extra_acked, tp->snd_cwnd);
|
|
+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
|
|
+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
|
|
+}
|
|
+
|
|
/* Estimate when the pipe is full, using the change in delivery rate: BBR
|
|
@@ -735,6 +863,4 @@
|
|
bbr->mode = BBR_DRAIN; /* drain queue we created */
|
|
- bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
|
|
- bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
|
|
tcp_sk(sk)->snd_ssthresh =
|
|
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
|
|
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
|
|
} /* fall through to check if in-flight is already small: */
|
|
@@ -742,3 +868,3 @@
|
|
tcp_packets_in_flight(tcp_sk(sk)) <=
|
|
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
|
|
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
|
|
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
|
|
@@ -798,4 +924,2 @@
|
|
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
|
|
- bbr->pacing_gain = BBR_UNIT;
|
|
- bbr->cwnd_gain = BBR_UNIT;
|
|
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
|
|
@@ -827,2 +951,31 @@
|
|
|
|
+static void bbr_update_gains(struct sock *sk)
|
|
+{
|
|
+ struct bbr *bbr = inet_csk_ca(sk);
|
|
+
|
|
+ switch (bbr->mode) {
|
|
+ case BBR_STARTUP:
|
|
+ bbr->pacing_gain = bbr_high_gain;
|
|
+ bbr->cwnd_gain = bbr_high_gain;
|
|
+ break;
|
|
+ case BBR_DRAIN:
|
|
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
|
|
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
|
|
+ break;
|
|
+ case BBR_PROBE_BW:
|
|
+ bbr->pacing_gain = (bbr->lt_use_bw ?
|
|
+ BBR_UNIT :
|
|
+ bbr_pacing_gain[bbr->cycle_idx]);
|
|
+ bbr->cwnd_gain = bbr_cwnd_gain;
|
|
+ break;
|
|
+ case BBR_PROBE_RTT:
|
|
+ bbr->pacing_gain = BBR_UNIT;
|
|
+ bbr->cwnd_gain = BBR_UNIT;
|
|
+ break;
|
|
+ default:
|
|
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
|
|
@@ -830,2 +983,3 @@
|
|
bbr_update_bw(sk, rs);
|
|
+ bbr_update_ack_aggregation(sk, rs);
|
|
bbr_update_cycle_phase(sk, rs);
|
|
@@ -834,2 +988,3 @@
|
|
bbr_update_min_rtt(sk, rs);
|
|
+ bbr_update_gains(sk);
|
|
}
|
|
@@ -880,2 +1035,9 @@
|
|
|
|
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
|
|
+ bbr->ack_epoch_acked = 0;
|
|
+ bbr->extra_acked_win_rtts = 0;
|
|
+ bbr->extra_acked_win_idx = 0;
|
|
+ bbr->extra_acked[0] = 0;
|
|
+ bbr->extra_acked[1] = 0;
|
|
+
|
|
cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
|