From e644a6ab6201694e9e660ea2505d876bae38fe45 Mon Sep 17 00:00:00 2001 From: dormando Date: Tue, 12 Feb 2013 20:41:57 -0800 Subject: [PATCH 01/10] initcwnd from userspace tunable --- include/net/tcp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp.c | 18 ++++++++++++++++++ net/ipv4/tcp_output.c | 1 + 5 files changed, 28 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1f0d8479e15f08..e1a4af82199491 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -283,6 +283,7 @@ extern int sysctl_tcp_challenge_ack_limit; extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; +extern int sysctl_tcp_user_cwnd_max; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 377f1e59411d15..f7c3381169ff8e 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -112,6 +112,7 @@ enum { #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CWND 24 /* Set congestion window */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43d3a..4dd46b2ddf2c1b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -693,6 +693,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, + { + .procname = "tcp_user_cwnd_max", + .data = &sysctl_tcp_user_cwnd_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "tcp_thin_linear_timeouts", .data = &sysctl_tcp_thin_linear_timeouts, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 29d240b87af1a3..388d7a8e503de8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2666,6 +2666,24 @@ static int 
do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_CWND: + if (sysctl_tcp_user_cwnd_max <= 0) + err = -EPERM; + else if (val > 0 && sk->sk_state == TCP_ESTABLISHED && + icsk->icsk_ca_state == TCP_CA_Open) { + u32 cwnd = val; + cwnd = min(cwnd, (u32)sysctl_tcp_user_cwnd_max); + cwnd = min(cwnd, tp->snd_cwnd_clamp); + + if (tp->snd_cwnd != cwnd) { + tp->snd_cwnd = cwnd; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_cnt = 0; + } + } else + err = -EINVAL; + break; + #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: /* Read the IP->Key mappings from userspace */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 91b98e5a17aa7d..987ade594619d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -67,6 +67,7 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); +int sysctl_tcp_user_cwnd_max __read_mostly; static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); From 56667eb6f60b7bb101e8eed3ef092cadd3073455 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 12 Feb 2013 18:46:32 -0800 Subject: [PATCH 02/10] add extra free kbytes tunable Add a userspace visible knob to tell the VM to keep an extra amount of memory free, by increasing the gap between each zone's min and low watermarks. This is useful for realtime applications that call system calls and have a bound on the number of allocations that happen in any short time period. In this application, extra_free_kbytes would be left at an amount equal to or larger than the maximum number of allocations that happen in any burst. It may also be useful to reduce the memory use of virtual machines (temporarily?), in a way that does not cause memory fragmentation like ballooning does.
--- Documentation/sysctl/vm.txt | 16 +++++++++++++++ include/linux/mmzone.h | 2 +- include/linux/swap.h | 2 ++ kernel/sysctl.c | 10 +++++++++- mm/page_alloc.c | 39 +++++++++++++++++++++++++++---------- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index b602784b289fb5..0c17688632a2bd 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold +- extra_free_kbytes - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -204,6 +205,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500. ============================================================== +extra_free_kbytes + +This parameter tells the VM to keep extra free memory between the threshold +where background reclaim (kswapd) kicks in, and the threshold where direct +reclaim (by allocating processes) kicks in. + +This is useful for workloads that require low latency memory allocations +and have a bounded burstiness in memory allocations, for example a +realtime application that receives and transmits network traffic +(causing in-kernel memory allocations) with a maximum total message burst +size of 200MB may need 200MB of extra free memory to avoid direct reclaim +related latencies. 
+ +============================================================== + hugepages_treat_as_movable This parameter controls whether we can allocate hugepages from ZONE_MOVABLE diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 18843532a0c93e..a821824b841e7c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -897,7 +897,7 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, +int free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/swap.h b/include/linux/swap.h index 78932497680197..ae06917c82136c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -259,6 +259,8 @@ struct swap_info_struct { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; +extern int min_free_kbytes; +extern int extra_free_kbytes; extern unsigned long dirty_balance_reserve; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c1b26e176aa6f6..40fbe8c7f453fe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1314,9 +1314,17 @@ static struct ctl_table vm_table[] = { .data = &min_free_kbytes, .maxlen = sizeof(min_free_kbytes), .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, + .proc_handler = free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "extra_free_kbytes", + .data = &extra_free_kbytes, + .maxlen = sizeof(extra_free_kbytes), + .mode = 0644, + .proc_handler = free_kbytes_sysctl_handler, + .extra1 = &zero, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 
4b258297cc7c98..d8d2351add0b30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -205,9 +205,22 @@ static char * const zone_names[MAX_NR_ZONES] = { "Movable", }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +/* + * Extra memory for the system to try freeing between the min and + * low watermarks. Useful for workloads that require low latency + * memory allocations in bursts larger than the normal gap between + * low and min. + */ +int extra_free_kbytes; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -5625,6 +5638,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -5636,11 +5650,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -5661,11 +5678,13 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. 
*/ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + (min >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + (min >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -5787,11 +5806,11 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so - * that we can call two helper functions whenever min_free_kbytes - * changes. + * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * or extra_free_kbytes changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; From b53ab0a126dae9aa51c6f07907d8cb2351abd20d Mon Sep 17 00:00:00 2001 From: dormando Date: Wed, 13 Feb 2013 11:27:55 -0800 Subject: [PATCH 03/10] Don't change initcwnd's magic number fucks us right up, it does. --- include/uapi/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index f7c3381169ff8e..3afe7676e5f584 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -105,14 +105,14 @@ enum { #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ #define TCP_THIN_DUPACK 17 /* Fast retrans. 
after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ -#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR 26 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 #define TCP_QUEUE_SEQ 21 #define TCP_REPAIR_OPTIONS 22 #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ -#define TCP_CWND 24 /* Set congestion window */ +#define TCP_CWND 19 /* Set congestion window */ struct tcp_repair_opt { __u32 opt_code; From 2ebc8b754de3335f9a71a7585e29532bb85f494d Mon Sep 17 00:00:00 2001 From: Artur Bergman Date: Mon, 21 Oct 2013 23:43:37 +0000 Subject: [PATCH 04/10] match the reuseport sk based on the smp_processor_id of the kernel thread dealing with the irq save the cpu id so we dont have to go over it again --- net/ipv4/inet_hashtables.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450d6c..b8c9d5545bf25e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -210,7 +210,8 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; - u32 phash = 0; + // u32 phash = 0; + int curr_cpu = smp_processor_id(); rcu_read_lock(); begin: @@ -223,15 +224,27 @@ struct sock *__inet_lookup_listener(struct net *net, hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { - phash = inet_ehashfn(net, daddr, hnum, + // matches++; + + /* phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - matches = 1; + matches = 1; */ } } else if (score == hiscore && reuseport) { - matches++; + + /* goes through the sks and find the one corresponding to our cpu + it is critical that a RSS queue is bound to a specific 
cpu + */ + // pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu); + if (matches++ == curr_cpu) { + result = sk; + } + + /* if (((u64)phash * matches) >> 32 == 0) result = sk; phash = next_pseudo_random32(phash); + */ } } /* From 2085c3a3596d7c336c6b40178e6f6617ca6db6af Mon Sep 17 00:00:00 2001 From: Artur Bergman Date: Wed, 23 Oct 2013 06:54:28 +0000 Subject: [PATCH 05/10] pass syncookie down and dont do multipath for them --- include/net/inet_connection_sock.h | 10 ++++++---- include/net/route.h | 14 +++++++------- net/ipv4/icmp.c | 4 ++-- net/ipv4/inet_connection_sock.c | 7 ++++--- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 20 ++++++++++---------- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/udp.c | 2 +- net/ipv4/xfrm4_policy.c | 2 +- 10 files changed, 36 insertions(+), 33 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index cf9272807788de..9603a517db1daf 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -253,10 +253,12 @@ int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); int inet_csk_get_port(struct sock *sk, unsigned short snum); -struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req); -struct dst_entry *inet_csk_route_child_sock(struct sock *sk, struct sock *newsk, - const struct request_sock *req); +struct dst_entry* inet_csk_route_req(struct sock *sk, + struct flowi4 *fl4, + const struct request_sock *req, int syncookie); +struct dst_entry* inet_csk_route_child_sock(struct sock *sk, + struct sock *newsk, + const struct request_sock *req); static inline void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, diff --git a/include/net/route.h b/include/net/route.h index 9d1f423d5944bc..38e5a463699d81 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -108,15 +108,15 @@ struct in_device; int ip_rt_init(void); 
void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp, int syncookie); struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, - struct sock *sk); + struct sock *sk, int syncookie); struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp) { - return ip_route_output_flow(net, flp, NULL); + return ip_route_output_flow(net, flp, NULL, 0); } static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, @@ -143,7 +143,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi daddr, saddr, dport, sport); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); - return ip_route_output_flow(net, fl4, sk); + return ip_route_output_flow(net, fl4, sk, 0); } static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, @@ -266,14 +266,14 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, sport, dport, sk); if (!dst || !src) { - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, 0); if (IS_ERR(rt)) return rt; ip_rt_put(rt); flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); - return ip_route_output_flow(net, fl4, sk); + return ip_route_output_flow(net, fl4, sk, 0); } static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, @@ -289,7 +289,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable RT_CONN_FLAGS(sk), fl4->daddr, fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); - return ip_route_output_flow(sock_net(sk), fl4, sk); + return ip_route_output_flow(sock_net(sk), fl4, sk, 0); } return rt; } diff --git a/net/ipv4/icmp.c 
b/net/ipv4/icmp.c index 1e4aa8354f93d4..ed55c278b4f9a5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -399,7 +399,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, 0); if (IS_ERR(rt)) return rt; @@ -421,7 +421,7 @@ static struct rtable *icmp_route_lookup(struct net *net, goto relookup_failed; if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { - rt2 = __ip_route_output_key(net, &fl4_dec); + rt2 = __ip_route_output_key(net, &fl4_dec, 0); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0d1e2cb877ec43..0d3b4c9d72c14f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -400,7 +400,8 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req) + const struct request_sock *req, + int want_cookie) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); @@ -415,7 +416,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); - rt = ip_route_output_flow(net, fl4, sk); + rt = ip_route_output_flow(net, fl4, sk, want_cookie); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) @@ -451,7 +452,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); - rt = ip_route_output_flow(net, fl4, sk); + rt = ip_route_output_flow(net, fl4, sk, 0); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 0d33f947a87f2f..4b5658fc307db9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -783,7 +783,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk, 0); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 11c8d81fdc59c0..60c57a719d7b48 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -586,7 +586,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 487bb62525208c..123cc9ed09207e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1009,7 +1009,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); - rt = __ip_route_output_key(net, &fl4); + rt = __ip_route_output_key(net, &fl4, 0); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); @@ -1024,7 +1024,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + rt = __ip_route_output_key(sock_net(sk), &fl4, 0); if 
(!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); @@ -1055,7 +1055,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) rt = (struct rtable *)odst; if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0); if (IS_ERR(rt)) goto out; @@ -1068,7 +1068,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (new) dst_release(&rt->dst); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0); if (IS_ERR(rt)) goto out; @@ -1093,7 +1093,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); - rt = __ip_route_output_key(net, &fl4); + rt = __ip_route_output_key(net, &fl4, 0); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -1108,7 +1108,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + rt = __ip_route_output_key(sock_net(sk), &fl4, 0); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -2001,7 +2001,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, * Major route resolver routine. 
*/ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4, int syncookie) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2157,7 +2157,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + if (syncookie == 0 && res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) fib_select_multipath(&res); else #endif @@ -2260,9 +2260,9 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, - struct sock *sk) + struct sock *sk, int syncookie) { - struct rtable *rt = __ip_route_output_key(net, flp4); + struct rtable *rt = __ip_route_output_key(net, flp4, syncookie); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a782d5be132e59..5051f74e626c8d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -831,7 +831,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff *skb; /* First, grab a route. 
*/ - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, 0)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, NULL); @@ -1530,7 +1530,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && + (dst = inet_csk_route_req(sk, &fl4, req, 0)) != NULL && fl4.daddr == saddr) { if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1559,7 +1559,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->snt_isn = isn; if (dst == NULL) { - dst = inet_csk_route_req(sk, &fl4, req); + dst = inet_csk_route_req(sk, &fl4, req, want_cookie); if (dst == NULL) goto drop_and_free; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b25e852625d8af..76b73960a193a4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -990,7 +990,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); - rt = ip_route_output_flow(net, fl4, sk); + rt = ip_route_output_flow(net, fl4, sk, 0); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a63930a96789..74ff05de52e7c9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, 0); if (!IS_ERR(rt)) return &rt->dst; From 76b4fe2b30dc38fab7fa94a4150be587b1806053 Mon Sep 17 00:00:00 2001 From: Artur Bergman Date: Thu, 24 Oct 2013 06:50:44 +0000 Subject: [PATCH 06/10] skip the first queue entry from RSS --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 18076c4178b4ff..1a3b16fc7c9576 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -3250,9 +3250,9 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), seed[i]); /* Fill out redirection table */ - for (i = 0, j = 0; i < 128; i++, j++) { + for (i = 0, j = 1; i < 128; i++, j++) { if (j == rss_i) - j = 0; + j = 1; /* reta = 4-byte sliding window of * 0x00..(indices-1)(indices-1)00..etc. */ reta = (reta << 8) | (j * 0x11); From ef585d093af1269e2e6dfbb1b6fb4bc8fce2ec51 Mon Sep 17 00:00:00 2001 From: dormando Date: Sat, 28 Jun 2014 17:37:34 -0700 Subject: [PATCH 07/10] accept a second number for TCP_CWND pivoting to 99 to make future maintenance simpler. --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3afe7676e5f584..b4294e2bf0636d 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -113,6 +113,7 @@ enum { #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ #define TCP_CWND 19 /* Set congestion window */ +#define TCP_CWND2 99 /* Set congestion window */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 388d7a8e503de8..ae20cd65438544 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2667,6 +2667,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CWND: + case TCP_CWND2: if (sysctl_tcp_user_cwnd_max <= 0) err = -EPERM; else if (val > 0 && sk->sk_state == TCP_ESTABLISHED && From e54595db8ab2274855baa87151cbdebb33c60b54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Taveira=20Ara=C3=BAjo?= Date: Sun, 4 May 2014 21:35:34 +0200 Subject: [PATCH 08/10] Add TCP_FASTLY_INFO, 
export nexthop used. Conflicts: net/ipv4/tcp.c --- include/uapi/linux/tcp.h | 14 ++++++++++++++ net/ipv4/tcp.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b4294e2bf0636d..cfd40acabc97d9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -115,6 +115,8 @@ enum { #define TCP_CWND 19 /* Set congestion window */ #define TCP_CWND2 99 /* Set congestion window */ +#define TCP_FASTLY_INFO 66 /* Additional info about connection. */ + struct tcp_repair_opt { __u32 opt_code; __u32 opt_val; @@ -190,6 +192,18 @@ struct tcp_info { __u32 tcpi_total_retrans; }; +struct tcp_fst_info { + __u8 version; + __u8 tos; + __u16 __unused; + union { + struct in_addr nexthop; + struct in6_addr nexthop6; + }; + __u32 __pad[11]; + struct tcp_info info; +}; + /* for TCP_MD5SIG socket option */ #define TCP_MD5SIG_MAXKEYLEN 80 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ae20cd65438544..de3cc77edb4a60 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2819,6 +2819,29 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) } EXPORT_SYMBOL_GPL(tcp_get_info); +/* Extend tcp_info with nexthop info. 
*/ +void tcp_get_fst_info(const struct sock *sk, struct tcp_fst_info *fst) +{ + struct inet_sock *inet = inet_sk(sk); + const struct dst_entry *dst = __sk_dst_get((struct sock *) sk); + + fst->version = 1; + fst->tos = inet->tos; + + if (!dst) { + memset(&fst->nexthop6, 0, sizeof(struct in6_addr)); + } else if (sk->sk_family == AF_INET) { + const struct rtable *rt = (const struct rtable *) dst; + memcpy(&fst->nexthop, &rt->rt_gateway, sizeof(struct in_addr)); + } else if (sk->sk_family == AF_INET6) { + const struct rt6_info *rt = (const struct rt6_info *) dst; + memcpy(&fst->nexthop6, &rt->rt6i_gateway, sizeof(struct in6_addr)); + } + + tcp_get_info(sk, &fst->info); +} +EXPORT_SYMBOL_GPL(tcp_get_fst_info); + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -2937,6 +2960,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_FASTLY_INFO: { + struct tcp_fst_info fst; + + if (get_user(len, optlen)) + return -EFAULT; + + tcp_get_fst_info(sk, &fst); + + len = min_t(unsigned int, len, sizeof(fst)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &fst, len)) + return -EFAULT; + return 0; + } default: return -ENOPROTOOPT; } From e68dc408ffc81aaf7d9ddfcba928115ae7033e81 Mon Sep 17 00:00:00 2001 From: dormando Date: Wed, 17 Dec 2014 13:09:46 -0800 Subject: [PATCH 09/10] Revert "ipv4: fix race in concurrent ip_route_input_slow()" This reverts commit dcdfdf56b4a6c9437fc37dbc9cee94a788f9b0c4. 
--- net/ipv4/route.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 123cc9ed09207e..df59c13c7f21b4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1786,12 +1786,8 @@ out: return err; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - if (do_cache) { - if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { - rth->dst.flags |= DST_NOCACHE; - rt_add_uncached_list(rth); - } - } + if (do_cache) + rt_cache_route(&FIB_RES_NH(res), rth); skb_dst_set(skb, &rth->dst); err = 0; goto out; From 3fcc688fe2ce3f02da0506771522ede39f31ff18 Mon Sep 17 00:00:00 2001 From: "Devon H. O'Dell" Date: Fri, 26 Dec 2014 16:20:22 -0800 Subject: [PATCH 10/10] net/tcp: Add sockopt to allow TIME_WAIT reuse When both the requesting socket and the socket found in TIME_WAIT state are requesting to reuse TIME_WAIT-state sockets, allow us to find the TW socket as unique. Previously, this was gated on a global sysctl, which is not ideal for our use case. 
--- include/net/sock.h | 5 ++++- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 4 ++++ net/ipv4/tcp_ipv4.c | 5 +++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index f66b2b19a6e445..d91bd62763fe39 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -141,6 +141,7 @@ typedef __u64 __bitwise __addrpair; * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting + * @skc_tw_reuse: %SO_FASTLY_TW_REUSE setting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol @@ -181,7 +182,8 @@ struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; - unsigned char skc_reuseport:4; + unsigned char skc_reuseport:2; + unsigned char skc_tw_reuse:2; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -316,6 +318,7 @@ struct sock { #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport +#define sk_tw_reuse __sk_common.skc_tw_reuse #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index ea0796bdcf8840..4677b2ea9b39b5 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -82,4 +82,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_FASTLY_TW_REUSE 49 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index c8069561bdb7be..2db194c764c2a1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -964,6 +964,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_max_pacing_rate); break; + case SO_FASTLY_TW_REUSE: + sk->sk_tw_reuse = valbool; + break; + 
default: ret = -ENOPROTOOPT; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5051f74e626c8d..4450e7db49d13d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -124,8 +124,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (twp == NULL || (sysctl_tcp_tw_reuse && - get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (twp == NULL || + ((sk->sk_tw_reuse && sktw->sk_tw_reuse) || sysctl_tcp_tw_reuse) && + get_seconds() - tcptw->tw_ts_recent_stamp > 1)) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1;