Skip to content

Commit 0590bf6

Browse files
dmfreemon authored and hmtheboy154 committed
Re-apply: tcp: enforce receive buffer memory limits by allowing the tcp window to shrink
This change is re-applied because of a mistake made when merging.

Signed-off-by: hmtheboy154 <buingoc67@gmail.com>
1 parent bee85bb commit 0590bf6

4 files changed

Lines changed: 69 additions & 9 deletions

File tree

Documentation/networking/ip-sysctl.rst

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -967,6 +967,21 @@ tcp_tw_reuse - INTEGER
967967
tcp_window_scaling - BOOLEAN
968968
Enable window scaling as defined in RFC1323.
969969

970+
tcp_shrink_window - BOOLEAN
971+
This changes how the TCP receive window is calculated.
972+
973+
RFC 7323, section 2.4, says there are instances when a retracted
974+
window can be offered, and that TCP implementations MUST ensure
975+
that they handle a shrinking window, as specified in RFC 1122.
976+
977+
- 0 - Disabled. The window is never shrunk.
978+
- 1 - Enabled. The window is shrunk when necessary to remain within
979+
the memory limit set by autotuning (sk_rcvbuf).
980+
This only occurs if a non-zero receive window
981+
scaling factor is also in effect.
982+
983+
Default: 0
984+
970985
tcp_wmem - vector of 3 INTEGERs: min, default, max
971986
min: Amount of memory reserved for send buffers for TCP sockets.
972987
Each TCP socket has rights to use it due to fact of its birth.

include/net/netns/ipv4.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,7 @@ struct netns_ipv4 {
6565
#endif
6666
bool fib_has_custom_local_routes;
6767
bool fib_offload_disabled;
68+
u8 sysctl_tcp_shrink_window;
6869
#ifdef CONFIG_IP_ROUTE_CLASSID
6970
atomic_t fib_num_tclassid_users;
7071
#endif

net/ipv4/tcp_ipv4.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3230,6 +3230,8 @@ static int __net_init tcp_sk_init(struct net *net)
32303230
else
32313231
net->ipv4.tcp_congestion_control = &tcp_reno;
32323232

3233+
net->ipv4.sysctl_tcp_shrink_window = 0;
3234+
32333235
return 0;
32343236
}
32353237

net/ipv4/tcp_output.c

Lines changed: 51 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -259,8 +259,8 @@ static u16 tcp_select_window(struct sock *sk)
259259
u32 old_win = tp->rcv_wnd;
260260
u32 cur_win = tcp_receive_window(tp);
261261
u32 new_win = __tcp_select_window(sk);
262+
struct net *net = sock_net(sk);
262263

263-
/* Never shrink the offered window */
264264
if (new_win < cur_win) {
265265
/* Danger Will Robinson!
266266
* Don't update rcv_wup/rcv_wnd here or else
@@ -269,19 +269,22 @@ static u16 tcp_select_window(struct sock *sk)
269269
*
270270
* Relax Will Robinson.
271271
*/
272-
if (new_win == 0)
273-
NET_INC_STATS(sock_net(sk),
274-
LINUX_MIB_TCPWANTZEROWINDOWADV);
275-
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
272+
if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
273+
/* Never shrink the offered window */
274+
if (new_win == 0)
275+
NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
276+
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
277+
}
276278
}
279+
277280
tp->rcv_wnd = new_win;
278281
tp->rcv_wup = tp->rcv_nxt;
279282

280283
/* Make sure we do not exceed the maximum possible
281284
* scaled window.
282285
*/
283286
if (!tp->rx_opt.rcv_wscale &&
284-
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
287+
READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
285288
new_win = min(new_win, MAX_TCP_WINDOW);
286289
else
287290
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -293,10 +296,9 @@ static u16 tcp_select_window(struct sock *sk)
293296
if (new_win == 0) {
294297
tp->pred_flags = 0;
295298
if (old_win)
296-
NET_INC_STATS(sock_net(sk),
297-
LINUX_MIB_TCPTOZEROWINDOWADV);
299+
NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
298300
} else if (old_win == 0) {
299-
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
301+
NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
300302
}
301303

302304
return new_win;
@@ -2990,6 +2992,7 @@ u32 __tcp_select_window(struct sock *sk)
29902992
{
29912993
struct inet_connection_sock *icsk = inet_csk(sk);
29922994
struct tcp_sock *tp = tcp_sk(sk);
2995+
struct net *net = sock_net(sk);
29932996
/* MSS for the peer's data. Previous versions used mss_clamp
29942997
* here. I don't know if the value based on our guesses
29952998
* of peer's MSS is better for the performance. It's more correct
@@ -3011,6 +3014,15 @@ u32 __tcp_select_window(struct sock *sk)
30113014
if (mss <= 0)
30123015
return 0;
30133016
}
3017+
3018+
/* Only allow window shrink if the sysctl is enabled and we have
3019+
* a non-zero scaling factor in effect.
3020+
*/
3021+
if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
3022+
goto shrink_window_allowed;
3023+
3024+
/* do not allow window to shrink */
3025+
30143026
if (free_space < (full_space >> 1)) {
30153027
icsk->icsk_ack.quick = 0;
30163028

@@ -3065,6 +3077,36 @@ u32 __tcp_select_window(struct sock *sk)
30653077
}
30663078

30673079
return window;
3080+
3081+
shrink_window_allowed:
3082+
/* new window should always be an exact multiple of scaling factor */
3083+
free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
3084+
3085+
if (free_space < (full_space >> 1)) {
3086+
icsk->icsk_ack.quick = 0;
3087+
3088+
if (tcp_under_memory_pressure(sk))
3089+
tcp_adjust_rcv_ssthresh(sk);
3090+
3091+
/* if free space is too low, return a zero window */
3092+
if (free_space < (allowed_space >> 4) || free_space < mss ||
3093+
free_space < (1 << tp->rx_opt.rcv_wscale))
3094+
return 0;
3095+
}
3096+
3097+
if (free_space > tp->rcv_ssthresh) {
3098+
free_space = tp->rcv_ssthresh;
3099+
/* new window should always be an exact multiple of scaling factor
3100+
*
3101+
* For this case, we ALIGN "up" (increase free_space) because
3102+
* we know free_space is not zero here, it has been reduced from
3103+
* the memory-based limit, and rcv_ssthresh is not a hard limit
3104+
* (unlike sk_rcvbuf).
3105+
*/
3106+
free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
3107+
}
3108+
3109+
return free_space;
30683110
}
30693111

30703112
void tcp_skb_collapse_tstamp(struct sk_buff *skb,

0 commit comments

Comments
 (0)