diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index b602784b289fb5..0c17688632a2bd 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
@@ -204,6 +205,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low-latency memory allocations
+and have a bounded burstiness in their allocations. For example, a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct
+reclaim related latencies.
+
+==============================================================
+
 hugepages_treat_as_movable
 
 This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
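A minimal userspace sketch of driving the new knob, sized for the 200MB
burst example above. The /proc path follows from the vm_table entry added
in kernel/sysctl.c later in this patch; error handling is trimmed:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		long burst_bytes = 200L * 1024 * 1024;	/* worst-case burst */
		long extra_kbytes = burst_bytes / 1024;	/* sysctl unit is KB */
		FILE *f = fopen("/proc/sys/vm/extra_free_kbytes", "w");

		if (!f)
			return EXIT_FAILURE;
		fprintf(f, "%ld\n", extra_kbytes);
		return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
	}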
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 18076c4178b4ff..1a3b16fc7c9576 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3250,9 +3250,9 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), seed[i]);
 
 	/* Fill out redirection table */
-	for (i = 0, j = 0; i < 128; i++, j++) {
+	for (i = 0, j = 1; i < 128; i++, j++) {
 		if (j == rss_i)
-			j = 0;
-		/* reta = 4-byte sliding window of
-		 * 0x00..(indices-1)(indices-1)00..etc. */
+			j = 1;
+		/* reta = 4-byte sliding window of
+		 * 0x11..(indices-1)(indices-1)11..etc.; queue 0 is skipped */
 		reta = (reta << 8) | (j * 0x11);
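With j starting at 1 and wrapping back to 1, RSS queue 0 never appears in
the 128-entry redirection table. A standalone sketch that reproduces the
resulting pattern (rss_i = 4 is an assumed example value):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t reta = 0;
		int rss_i = 4;	/* example queue count */
		int i, j;

		for (i = 0, j = 1; i < 128; i++, j++) {
			if (j == rss_i)
				j = 1;	/* wrap to 1, not 0 */
			reta = (reta << 8) | (j * 0x11);
			if ((i & 3) == 3)	/* register holds 4 entries */
				printf("reta[%3d..%3d] = 0x%08x\n",
				       i - 3, i, reta);
		}
		return 0;
	}

Every byte cycles 0x11, 0x22, 0x33: traffic is steered away from queue 0,
presumably so it lines up with the CPU-indexed listener lookup added in
inet_hashtables.c later in this patch.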
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18843532a0c93e..a821824b841e7c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -897,7 +897,7 @@ static inline int is_highmem(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 78932497680197..ae06917c82136c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,6 +259,8 @@ struct swap_info_struct {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern unsigned long dirty_balance_reserve;
 extern unsigned long nr_free_buffer_pages(void);
 extern unsigned long nr_free_pagecache_pages(void);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index cf9272807788de..9603a517db1daf 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -253,10 +253,12 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			   const struct inet_bind_bucket *tb, bool relax);
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
-struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4,
-				     const struct request_sock *req);
-struct dst_entry *inet_csk_route_child_sock(struct sock *sk, struct sock *newsk,
-					    const struct request_sock *req);
+struct dst_entry *inet_csk_route_req(struct sock *sk,
+				     struct flowi4 *fl4,
+				     const struct request_sock *req, int syncookie);
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+					    struct sock *newsk,
+					    const struct request_sock *req);
 
 static inline void inet_csk_reqsk_queue_add(struct sock *sk,
 					    struct request_sock *req,
diff --git a/include/net/route.h b/include/net/route.h
index 9d1f423d5944bc..38e5a463699d81 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -108,15 +108,15 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
+struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp, int syncookie);
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
-				    struct sock *sk);
+				    struct sock *sk, int syncookie);
 struct dst_entry *ipv4_blackhole_route(struct net *net,
 				       struct dst_entry *dst_orig);
 
 static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
 {
-	return ip_route_output_flow(net, flp, NULL);
+	return ip_route_output_flow(net, flp, NULL, 0);
 }
 
 static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
@@ -143,7 +143,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
 			   daddr, saddr, dport, sport);
 	if (sk)
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
-	return ip_route_output_flow(net, fl4, sk);
+	return ip_route_output_flow(net, fl4, sk, 0);
 }
 
 static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
@@ -266,14 +266,14 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
 			      sport, dport, sk);
 
 	if (!dst || !src) {
-		rt = __ip_route_output_key(net, fl4);
+		rt = __ip_route_output_key(net, fl4, 0);
 		if (IS_ERR(rt))
 			return rt;
 		ip_rt_put(rt);
 		flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr);
 	}
 	security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
-	return ip_route_output_flow(net, fl4, sk);
+	return ip_route_output_flow(net, fl4, sk, 0);
 }
 
 static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
@@ -289,7 +289,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 					      RT_CONN_FLAGS(sk), fl4->daddr,
 					      fl4->saddr);
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
-		return ip_route_output_flow(sock_net(sk), fl4, sk);
+		return ip_route_output_flow(sock_net(sk), fl4, sk, 0);
 	}
 	return rt;
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index f66b2b19a6e445..d91bd62763fe39 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -141,6 +141,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_reuseport: %SO_REUSEPORT setting
+ *	@skc_tw_reuse: %SO_FASTLY_TW_REUSE setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -181,7 +182,8 @@ struct sock_common {
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse:4;
-	unsigned char		skc_reuseport:4;
+	unsigned char		skc_reuseport:2;
+	unsigned char		skc_tw_reuse:2;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -316,6 +318,7 @@ struct sock {
 #define sk_state		__sk_common.skc_state
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_reuseport		__sk_common.skc_reuseport
+#define sk_tw_reuse		__sk_common.skc_tw_reuse
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1f0d8479e15f08..e1a4af82199491 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -283,6 +283,7 @@ extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_user_cwnd_max;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index ea0796bdcf8840..4677b2ea9b39b5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -82,4 +82,6 @@
 
 #define SO_BPF_EXTENSIONS	48
 
+#define SO_FASTLY_TW_REUSE	49
+
 #endif /* __ASM_GENERIC_SOCKET_H */
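A usage sketch for the new socket option (the constant mirrors the uapi
hunk above; as the tcp_twsk_unique() change in tcp_ipv4.c further down
shows, both the connecting socket and the timewait socket must carry the
flag unless the global tcp_tw_reuse sysctl is already on):

	#include <sys/socket.h>

	#ifndef SO_FASTLY_TW_REUSE
	#define SO_FASTLY_TW_REUSE 49	/* from asm-generic/socket.h above */
	#endif

	/* Allow this socket to reuse a TIME-WAIT four-tuple without
	 * flipping net.ipv4.tcp_tw_reuse system-wide.
	 */
	int enable_tw_reuse(int fd)
	{
		int one = 1;

		return setsockopt(fd, SOL_SOCKET, SO_FASTLY_TW_REUSE,
				  &one, sizeof(one));
	}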
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 377f1e59411d15..cfd40acabc97d9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -105,13 +105,17 @@ enum {
 #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
 #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
-#define TCP_REPAIR		19	/* TCP sock is under repair right now */
+#define TCP_REPAIR		26	/* TCP sock is under repair right now */
 #define TCP_REPAIR_QUEUE	20
 #define TCP_QUEUE_SEQ		21
 #define TCP_REPAIR_OPTIONS	22
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
+#define TCP_CWND		19	/* Set congestion window */
+#define TCP_CWND2		99	/* Set congestion window */
+
+#define TCP_FASTLY_INFO		66	/* Additional info about connection. */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -188,6 +192,18 @@ struct tcp_info {
 	__u32	tcpi_total_retrans;
 };
 
+struct tcp_fst_info {
+	__u8	version;
+	__u8	tos;
+	__u16	__unused;
+	union {
+		struct in_addr	nexthop;
+		struct in6_addr	nexthop6;
+	};
+	__u32	__pad[11];
+	struct tcp_info	info;
+};
+
 /* for TCP_MD5SIG socket option */
 #define TCP_MD5SIG_MAXKEYLEN	80
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c1b26e176aa6f6..40fbe8c7f453fe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1314,9 +1314,17 @@ static struct ctl_table vm_table[] = {
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
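A sketch of how a tuning daemon might drive the renumbered TCP_CWND option
(constants from the uapi hunk above). As do_tcp_setsockopt() in
net/ipv4/tcp.c further down shows, the kernel clamps the value to
tcp_user_cwnd_max and snd_cwnd_clamp, applies it only to an ESTABLISHED
socket in TCP_CA_Open state, and returns EPERM while the sysctl is still 0:

	#include <netinet/in.h>
	#include <sys/socket.h>

	#ifndef TCP_CWND
	#define TCP_CWND 19	/* from the uapi hunk above */
	#endif

	int set_cwnd(int fd, unsigned int segments)
	{
		return setsockopt(fd, IPPROTO_TCP, TCP_CWND,
				  &segments, sizeof(segments));
	}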
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4b258297cc7c98..d8d2351add0b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,9 +205,22 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 
+/*
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks. Useful for workloads that require low latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
+ */
+int extra_free_kbytes;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5625,6 +5638,7 @@ static void setup_per_zone_lowmem_reserve(void)
 static void __setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5636,11 +5650,14 @@ static void __setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->managed_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->managed_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->managed_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5661,11 +5678,13 @@ static void __setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					      low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					      low + (min >> 1);
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
 				      high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -5787,11 +5806,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
- *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ *	that we can call two helper functions whenever min_free_kbytes
+ *	or extra_free_kbytes changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+int free_kbytes_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int rc;
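To make the new watermark arithmetic concrete, a worked example with
assumed numbers (one 4GiB zone with 4KiB pages, min_free_kbytes=8192,
extra_free_kbytes=204800, i.e. the 200MB burst from the documentation):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t managed_pages = 1048576;	/* 4GiB / 4KiB */
		uint64_t lowmem_pages = managed_pages;	/* single zone */
		uint64_t total_pages = managed_pages;	/* vm_total_pages */

		uint64_t pages_min = 8192 >> 2;		/* KB -> 4KiB pages */
		uint64_t pages_low = 204800 >> 2;

		uint64_t min = pages_min * managed_pages / lowmem_pages;
		uint64_t low = pages_low * managed_pages / total_pages;

		printf("WMARK_MIN  = %llu pages\n", (unsigned long long)min);
		printf("WMARK_LOW  = %llu pages\n",
		       (unsigned long long)(min + low + (min >> 2)));
		printf("WMARK_HIGH = %llu pages\n",
		       (unsigned long long)(min + low + (min >> 1)));
		return 0;
	}

With these inputs kswapd is woken once free memory drops below roughly
210MB (53760 pages) instead of 10MB (2560 pages), while the direct-reclaim
threshold stays at WMARK_MIN (2048 pages).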
diff --git a/net/core/sock.c b/net/core/sock.c
index c8069561bdb7be..2db194c764c2a1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -964,6 +964,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 					 sk->sk_max_pacing_rate);
 		break;
 
+	case SO_FASTLY_TW_REUSE:
+		sk->sk_tw_reuse = valbool;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1e4aa8354f93d4..ed55c278b4f9a5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -399,7 +399,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key(net, fl4);
+	rt = __ip_route_output_key(net, fl4, 0);
 	if (IS_ERR(rt))
 		return rt;
 
@@ -421,7 +421,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 		goto relookup_failed;
 
 	if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
-		rt2 = __ip_route_output_key(net, &fl4_dec);
+		rt2 = __ip_route_output_key(net, &fl4_dec, 0);
 		if (IS_ERR(rt2))
 			err = PTR_ERR(rt2);
 	} else {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0d1e2cb877ec43..0d3b4c9d72c14f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -400,7 +400,8 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
 				     struct flowi4 *fl4,
-				     const struct request_sock *req)
+				     const struct request_sock *req,
+				     int want_cookie)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -415,7 +416,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
-	rt = ip_route_output_flow(net, fl4, sk);
+	rt = ip_route_output_flow(net, fl4, sk, want_cookie);
 	if (IS_ERR(rt))
 		goto no_route;
 	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
@@ -451,7 +452,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
-	rt = ip_route_output_flow(net, fl4, sk);
+	rt = ip_route_output_flow(net, fl4, sk, 0);
 	if (IS_ERR(rt))
 		goto no_route;
 	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8b9cf279450d6c..b8c9d5545bf25e 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -210,7 +210,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore, matches = 0, reuseport = 0;
-	u32 phash = 0;
+	int curr_cpu = smp_processor_id();
 
 	rcu_read_lock();
 begin:
@@ -223,15 +223,13 @@ struct sock *__inet_lookup_listener(struct net *net,
 			hiscore = score;
 			reuseport = sk->sk_reuseport;
-			if (reuseport) {
-				phash = inet_ehashfn(net, daddr, hnum,
-						     saddr, sport);
-				matches = 1;
-			}
 		} else if (score == hiscore && reuseport) {
-			matches++;
-			if (((u64)phash * matches) >> 32 == 0)
-				result = sk;
-			phash = next_pseudo_random32(phash);
+			/*
+			 * Pick the equal-score socket whose index matches
+			 * this CPU. This relies on each RSS queue being
+			 * bound to a specific CPU.
+			 */
+			if (matches++ == curr_cpu)
+				result = sk;
 		}
 	}
 	/*
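The CPU-indexed selection above assumes one SO_REUSEPORT listener per RSS
queue and each queue's IRQ pinned to its own CPU; whether a listener's
position in the hash chain really corresponds to creation order is an
implementation detail, so treat this userspace pairing as illustrative
only (the helper name is hypothetical):

	#define _GNU_SOURCE
	#include <netinet/in.h>
	#include <sched.h>
	#include <string.h>
	#include <sys/socket.h>

	#ifndef SO_REUSEPORT
	#define SO_REUSEPORT 15
	#endif

	/* One listener per CPU: pin the calling thread, then open a
	 * SO_REUSEPORT listener intended to line up with the RSS queue
	 * that interrupts this CPU.
	 */
	int listener_for_cpu(int cpu, unsigned short port)
	{
		struct sockaddr_in addr;
		cpu_set_t set;
		int one = 1;
		int fd;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		if (sched_setaffinity(0, sizeof(set), &set))
			return -1;

		fd = socket(AF_INET, SOCK_STREAM, 0);
		if (fd < 0)
			return -1;
		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_addr.s_addr = htonl(INADDR_ANY);
		addr.sin_port = htons(port);
		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
		    listen(fd, 128) < 0)
			return -1;
		return fd;
	}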
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 0d33f947a87f2f..4b5658fc307db9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -783,7 +783,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 			   inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
 
 	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-	rt = ip_route_output_flow(net, &fl4, sk);
+	rt = ip_route_output_flow(net, &fl4, sk, 0);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		rt = NULL;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 11c8d81fdc59c0..60c57a719d7b48 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -586,7 +586,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 
 	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+	rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		rt = NULL;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 487bb62525208c..df59c13c7f21b4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1009,7 +1009,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 
 	__build_flow_key(&fl4, NULL, iph, oif,
 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
-	rt = __ip_route_output_key(net, &fl4);
+	rt = __ip_route_output_key(net, &fl4, 0);
 	if (!IS_ERR(rt)) {
 		__ip_rt_update_pmtu(rt, &fl4, mtu);
 		ip_rt_put(rt);
@@ -1024,7 +1024,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	struct rtable *rt;
 
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
-	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	rt = __ip_route_output_key(sock_net(sk), &fl4, 0);
 	if (!IS_ERR(rt)) {
 		__ip_rt_update_pmtu(rt, &fl4, mtu);
 		ip_rt_put(rt);
@@ -1055,7 +1055,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 
 	rt = (struct rtable *)odst;
 	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
-		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+		rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0);
 		if (IS_ERR(rt))
 			goto out;
 
@@ -1068,7 +1068,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 
 	if (new)
 		dst_release(&rt->dst);
-	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+	rt = ip_route_output_flow(sock_net(sk), &fl4, sk, 0);
 	if (IS_ERR(rt))
 		goto out;
 
@@ -1093,7 +1093,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
 
 	__build_flow_key(&fl4, NULL, iph, oif,
 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
-	rt = __ip_route_output_key(net, &fl4);
+	rt = __ip_route_output_key(net, &fl4, 0);
 	if (!IS_ERR(rt)) {
 		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
@@ -1108,7 +1108,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 	struct rtable *rt;
 
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
-	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	rt = __ip_route_output_key(sock_net(sk), &fl4, 0);
 	if (!IS_ERR(rt)) {
 		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
@@ -1786,12 +1786,8 @@ out:	return err;
 		rth->dst.error= -err;
 		rth->rt_flags	&= ~RTCF_LOCAL;
 	}
-	if (do_cache) {
-		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
-			rth->dst.flags |= DST_NOCACHE;
-			rt_add_uncached_list(rth);
-		}
-	}
+	if (do_cache)
+		rt_cache_route(&FIB_RES_NH(res), rth);
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;
@@ -2001,7 +1997,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4, int syncookie)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2157,7 +2153,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+	if (syncookie == 0 && res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
 		fib_select_multipath(&res);
 	else
 #endif
@@ -2260,9 +2256,9 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 }
 
 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
-				    struct sock *sk)
+				    struct sock *sk, int syncookie)
 {
-	struct rtable *rt = __ip_route_output_key(net, flp4);
+	struct rtable *rt = __ip_route_output_key(net, flp4, syncookie);
 
 	if (IS_ERR(rt))
 		return rt;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba052b43d3a..4dd46b2ddf2c1b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -693,6 +693,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_allowed_congestion_control,
 	},
+	{
+		.procname	= "tcp_user_cwnd_max",
+		.data		= &sysctl_tcp_user_cwnd_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_thin_linear_timeouts",
 		.data		= &sysctl_tcp_thin_linear_timeouts,
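TCP_CWND stays disabled until this sysctl is raised above zero; the
variable is defined with no initializer in tcp_output.c below, so
setsockopt() returns EPERM by default. A small sketch for opting in:

	#include <stdio.h>

	int enable_user_cwnd(int max_segments)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/tcp_user_cwnd_max", "w");

		if (!f)
			return -1;
		fprintf(f, "%d\n", max_segments);
		return fclose(f);
	}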
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29d240b87af1a3..de3cc77edb4a60 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2666,6 +2666,25 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_CWND:
+	case TCP_CWND2:
+		if (sysctl_tcp_user_cwnd_max <= 0)
+			err = -EPERM;
+		else if (val > 0 && sk->sk_state == TCP_ESTABLISHED &&
+			 icsk->icsk_ca_state == TCP_CA_Open) {
+			u32 cwnd = val;
+			cwnd = min(cwnd, (u32)sysctl_tcp_user_cwnd_max);
+			cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+			if (tp->snd_cwnd != cwnd) {
+				tp->snd_cwnd = cwnd;
+				tp->snd_cwnd_stamp = tcp_time_stamp;
+				tp->snd_cwnd_cnt = 0;
+			}
+		} else
+			err = -EINVAL;
+		break;
+
 #ifdef CONFIG_TCP_MD5SIG
 	case TCP_MD5SIG:
 		/* Read the IP->Key mappings from userspace */
@@ -2800,6 +2819,31 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
+/* Extend tcp_info with next-hop info. */
+void tcp_get_fst_info(const struct sock *sk, struct tcp_fst_info *fst)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	const struct dst_entry *dst = __sk_dst_get((struct sock *)sk);
+
+	/* Zero everything first so the __unused/__pad fields never leak
+	 * uninitialised kernel memory to userspace.
+	 */
+	memset(fst, 0, sizeof(*fst));
+	fst->version = 1;
+	fst->tos = inet->tos;
+
+	if (dst && sk->sk_family == AF_INET) {
+		const struct rtable *rt = (const struct rtable *)dst;
+		memcpy(&fst->nexthop, &rt->rt_gateway, sizeof(struct in_addr));
+	} else if (dst && sk->sk_family == AF_INET6) {
+		const struct rt6_info *rt = (const struct rt6_info *)dst;
+		memcpy(&fst->nexthop6, &rt->rt6i_gateway, sizeof(struct in6_addr));
+	}
+
+	tcp_get_info(sk, &fst->info);
+}
+EXPORT_SYMBOL_GPL(tcp_get_fst_info);
+
 static int do_tcp_getsockopt(struct sock *sk, int level,
 		int optname, char __user *optval, int __user *optlen)
 {
@@ -2918,6 +2962,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
+	case TCP_FASTLY_INFO: {
+		struct tcp_fst_info fst;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		tcp_get_fst_info(sk, &fst);
+
+		len = min_t(unsigned int, len, sizeof(fst));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &fst, len))
+			return -EFAULT;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
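Retrieving the extended info from userspace, a sketch assuming the patched
<linux/tcp.h> is on the include path (SOL_TCP is defined as a fallback;
the uapi struct needs struct in_addr/in6_addr, hence the extra includes):

	#include <linux/in.h>
	#include <linux/in6.h>
	#include <linux/tcp.h>	/* patched: TCP_FASTLY_INFO, tcp_fst_info */
	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef SOL_TCP
	#define SOL_TCP 6
	#endif

	int print_fst_info(int fd)
	{
		struct tcp_fst_info fst;
		socklen_t len = sizeof(fst);

		if (getsockopt(fd, SOL_TCP, TCP_FASTLY_INFO, &fst, &len))
			return -1;

		/* version 1 carries tos, the next hop and a full tcp_info */
		printf("version=%u tos=%u rtt=%uus\n",
		       fst.version, fst.tos, fst.info.tcpi_rtt);
		return 0;
	}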
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a782d5be132e59..4450e7db49d13d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -124,8 +124,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (twp == NULL || (sysctl_tcp_tw_reuse &&
-			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+	    (twp == NULL ||
+	     (((sk->sk_tw_reuse && sktw->sk_tw_reuse) || sysctl_tcp_tw_reuse) &&
+	      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
 			tp->write_seq = 1;
@@ -831,7 +832,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, 0)) == NULL)
 		return -1;
 
 	skb = tcp_make_synack(sk, dst, req, NULL);
@@ -1530,7 +1531,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+		    (dst = inet_csk_route_req(sk, &fl4, req, 0)) != NULL &&
 		    fl4.daddr == saddr) {
 			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1559,7 +1560,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->snt_isn = isn;
 
 	if (dst == NULL) {
-		dst = inet_csk_route_req(sk, &fl4, req);
+		dst = inet_csk_route_req(sk, &fl4, req, want_cookie);
 		if (dst == NULL)
 			goto drop_and_free;
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 91b98e5a17aa7d..987ade594619d1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -67,6 +67,7 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
 unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
 EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+int sysctl_tcp_user_cwnd_max __read_mostly;
 
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b25e852625d8af..76b73960a193a4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -990,7 +990,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				   faddr, saddr, dport, inet->inet_sport);
 
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
-		rt = ip_route_output_flow(net, fl4, sk);
+		rt = ip_route_output_flow(net, fl4, sk, 0);
 		if (IS_ERR(rt)) {
 			err = PTR_ERR(rt);
 			rt = NULL;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e1a63930a96789..74ff05de52e7c9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 	if (saddr)
 		fl4->saddr = saddr->a4;
 
-	rt = __ip_route_output_key(net, fl4);
+	rt = __ip_route_output_key(net, fl4, 0);
 	if (!IS_ERR(rt))
 		return &rt->dst;