diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dcc75a9ed91961..b81fca90f7fe4a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold +- extra_free_kbytes - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -198,6 +199,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500. ============================================================== +extra_free_kbytes + +This parameter tells the VM to keep extra free memory between the threshold +where background reclaim (kswapd) kicks in, and the threshold where direct +reclaim (by allocating processes) kicks in. + +This is useful for workloads that require low latency memory allocations +and have a bounded burstiness in memory allocations. For example, a +realtime application that receives and transmits network traffic +(causing in-kernel memory allocations) with a maximum total message burst +size of 200MB may need 200MB of extra free memory to avoid direct reclaim +related latencies. 
+ +============================================================== + hugepages_treat_as_movable This parameter is only useful when kernelcore= is specified at boot time to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c76737d836b1e..cf75a9d2e04513 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -891,7 +891,7 @@ static inline int is_dma(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, +int free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/swap.h b/include/linux/swap.h index 1701ce4be74650..3b3da79c3d2c4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -224,6 +224,8 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; +extern int min_free_kbytes; +extern int extra_free_kbytes; extern unsigned long dirty_balance_reserve; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/include/net/tcp.h b/include/net/tcp.h index 5bba80fbd1d9d9..90a780ef4b1b7a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -287,6 +287,7 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; +extern int sysctl_tcp_user_cwnd_max; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8d776ebc4829df..588390cf442380 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -105,12 +105,13 @@ enum { #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin 
streams*/ #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ -#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR 24 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 #define TCP_QUEUE_SEQ 21 #define TCP_REPAIR_OPTIONS 22 #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ -#define TCP_TIMESTAMP 24 +#define TCP_TIMESTAMP 25 +#define TCP_CWND 19 /* Set congestion window */ struct tcp_repair_opt { __u32 opt_code; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fcaa..e7ba63c8eda80c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1262,9 +1262,17 @@ static struct ctl_table vm_table[] = { .data = &min_free_kbytes, .maxlen = sizeof(min_free_kbytes), .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, + .proc_handler = free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "extra_free_kbytes", + .data = &extra_free_kbytes, + .maxlen = sizeof(extra_free_kbytes), + .mode = 0644, + .proc_handler = free_kbytes_sysctl_handler, + .extra1 = &zero, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ee0fd313f036e..a6becb8adb111c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -196,8 +196,21 @@ static char * const zone_names[MAX_NR_ZONES] = { "Movable", }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; +/* + * Extra memory for the system to try freeing between the min and + * low watermarks. Useful for workloads that require low latency + * memory allocations in bursts larger than the normal gap between + * low and min. 
+ */ +int extra_free_kbytes; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -5320,6 +5333,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -5331,11 +5345,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -5356,11 +5373,13 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + (min >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + (min >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); @@ -5471,11 +5490,11 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so - * that we can call two helper functions whenever min_free_kbytes - * changes. + * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * or extra_free_kbytes changes. 
*/ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 6af375afeeef1e..970c2180132a33 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -185,7 +185,8 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; - u32 phash = 0; + // u32 phash = 0; + int curr_cpu = smp_processor_id(); rcu_read_lock(); begin: @@ -198,15 +199,27 @@ struct sock *__inet_lookup_listener(struct net *net, hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { - phash = inet_ehashfn(net, daddr, hnum, + // matches++; + + /* phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - matches = 1; + matches = 1; */ } } else if (score == hiscore && reuseport) { - matches++; + + /* goes through the sks and find the one corresponding to our cpu + it is critical that a RSS queue is bound to a specific cpu + */ + // pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu); + if (matches++ == curr_cpu) { + result = sk; + } + + /* if (((u64)phash * matches) >> 32 == 0) result = sk; phash = next_pseudo_random32(phash); + */ } } /* diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3f25e75ae692e9..38a470f86887c7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -722,6 +722,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, + { + .procname = "tcp_user_cwnd_max", + .data = &sysctl_tcp_user_cwnd_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "tcp_max_ssthresh", .data = &sysctl_tcp_max_ssthresh, diff 
--git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2005561861ad03..e5598716798bc0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2602,6 +2602,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_CWND: + if (sysctl_tcp_user_cwnd_max <= 0) + err = -EPERM; + else if (val > 0 && sk->sk_state == TCP_ESTABLISHED && + icsk->icsk_ca_state == TCP_CA_Open) { + u32 cwnd = val; + cwnd = min(cwnd, (u32)sysctl_tcp_user_cwnd_max); + cwnd = min(cwnd, tp->snd_cwnd_clamp); + + if (tp->snd_cwnd != cwnd) { + tp->snd_cwnd = cwnd; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_cnt = 0; + } + } else + err = -EINVAL; + break; + #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: /* Read the IP->Key mappings from userspace */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4b75aad14b04a0..28bdc144cab642 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2773,7 +2773,20 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - WARN_ON(tp->retrans_out != 0); + if (WARN_ON(tp->retrans_out != 0)) { + printk(KERN_DEBUG "%pI4:%u F0x%x S%u s%d IF%u+%u-%u-%u" + "f%u ur%u rr%u rt%u um%u hs%u nxt%u\n", + &inet_sk(sk)->inet_daddr, + ntohs(inet_sk(sk)->inet_dport), + flag, sk->sk_state, tp->rx_opt.sack_ok, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out, + tp->frto, tp->undo_retrans, + tp->reordering, icsk->icsk_retransmits, + tp->undo_marker ? 
tp->undo_marker-tp->snd_una:0, + tp->high_seq - tp->snd_una, + tp->snd_nxt - tp->snd_una); + } tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -3314,7 +3327,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tcp_init_cwnd_reduction(sk, true); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); - tcp_set_ca_state(sk, TCP_CA_Open); + tcp_try_keep_open(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0145ce7e609881..7f6a9c7f17988f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,6 +65,8 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +int sysctl_tcp_user_cwnd_max __read_mostly; + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp);