From 6a234149c8e8f4308e74eabb7fb9e9ed73769296 Mon Sep 17 00:00:00 2001 From: bneradt Date: Sat, 19 Feb 2022 22:59:19 +0000 Subject: [PATCH] Add back "DNS: Fix lack of nameserver failover in low use circumstances." This adds back the fix for nameserver failover in low use circumstances and addresses the regression test instability caused when the test box has connectivity issues with the DNS server. --- iocore/dns/DNS.cc | 28 +++++++++++++++++++++------- iocore/dns/P_DNSProcessor.h | 23 +++++++++++++++-------- src/traffic_server/InkAPITest.cc | 7 +++++++ 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc index e7e9ebd2bbd..9ad2ea32285 100644 --- a/iocore/dns/DNS.cc +++ b/iocore/dns/DNS.cc @@ -524,12 +524,12 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc } return false; } else { - ns_down[icon] = 0; if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) { Error("[iocore_dns] open_con: Failed to add %d server to epoll list\n", icon); } else { - cur_con.num = icon; - Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon); + cur_con.num = icon; + ns_down[icon] = 0; + Debug("dns", "opening connection %s on fd %d SUCCEEDED for %d", ip_text, cur_con.fd, icon); } ret = true; } @@ -729,6 +729,15 @@ void DNSHandler::failover() { Debug("dns", "failover: initiating failover attempt, current name_server=%d", name_server); + if (!ns_down[name_server]) { + ip_text_buffer buff; + // mark this nameserver as down + Debug("dns", "failover: Marking nameserver %d as down", name_server); + ns_down[name_server] = 1; + Warning("connection to DNS server %s lost, marking as down", + ats_ip_ntop(&m_res->nsaddr_list[name_server].sa, buff, sizeof(buff))); + } + // no hope, if we have only one server if (m_res->nscount > 1) { ip_text_buffer buff1, buff2; @@ -766,6 +775,8 @@ DNSHandler::failover() ip_text_buffer buff; Warning("failover: connection to DNS server %s lost, retrying", ats_ip_ntop(&ip.sa, buff, sizeof(buff))); } + // Make sure retries are done even if no more requests. + this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD); } /** Mark one of the nameservers as down. */ @@ -779,6 +790,8 @@ DNSHandler::rr_failure(int ndx) Debug("dns", "rr_failure: Marking nameserver %d as down", ndx); ns_down[ndx] = 1; Warning("connection to DNS server %s lost, marking as down", ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff))); + // Make sure retries are done even if no more requests. + this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD); } int nscount = m_res->nscount; @@ -971,6 +984,11 @@ DNSHandler::check_and_reset_tcp_conn() int DNSHandler::mainEvent(int event, Event *e) { + // If this was a scheduled retry event, clear the associated flag. + if (e && e->cookie == RETRY_COOKIE) { + this->nameserver_retry_in_flight_p = false; + } + recv_dns(event, e); if (dns_ns_rr) { if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) { @@ -1017,10 +1035,6 @@ DNSHandler::mainEvent(int event, Event *e) write_dns(this); } - if (std::any_of(ns_down, ns_down + n_con, [](int f) { return f != 0; })) { - this_ethread()->schedule_at(this, DNS_PRIMARY_RETRY_PERIOD); - } - return EVENT_CONT; } diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h index 4c16f32f993..7d0bba98d2c 100644 --- a/iocore/dns/P_DNSProcessor.h +++ b/iocore/dns/P_DNSProcessor.h @@ -169,9 +169,16 @@ struct DNSHandler : public Continuation { DNSConnection udpcon[MAX_NAMED]; Queue entries; Queue triggered; - int in_flight = 0; - int name_server = 0; - int in_write_dns = 0; + int in_flight = 0; + int name_server = 0; + int in_write_dns = 0; + /// Rate limiter for down nameserver retries. + /// Don't schedule another if there is already one in flight. + std::atomic nameserver_retry_in_flight_p{false}; + /// Marker for event cookie to indicate it's a nameserver retry event. + /// @note Can't be @c constexpr because of the cast. + static inline void *const RETRY_COOKIE{reinterpret_cast(0x2)}; + HostEnt *hostent_cache = nullptr; int ns_down[MAX_NAMED]; @@ -212,16 +219,16 @@ struct DNSHandler : public Continuation { (ink_hrtime)HRTIME_SECONDS(dns_failover_period)); Debug("dns", "\tdelta time is %" PRId64 "", (Thread::get_hrtime() - crossed_failover_number[i])); } - return (crossed_failover_number[i] && - ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period))); + return ns_down[i] || (crossed_failover_number[i] && + ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period))); } bool failover_soon(int i) { - return (crossed_failover_number[i] && - ((Thread::get_hrtime() - crossed_failover_number[i]) > - (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY)))); + return ns_down[i] || (crossed_failover_number[i] && + ((Thread::get_hrtime() - crossed_failover_number[i]) > + (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY)))); } void recv_dns(int event, Event *e); diff --git a/src/traffic_server/InkAPITest.cc b/src/traffic_server/InkAPITest.cc index cd000967bbf..21d61abfa50 100644 --- a/src/traffic_server/InkAPITest.cc +++ b/src/traffic_server/InkAPITest.cc @@ -83,6 +83,8 @@ #define ERROR_BODY "TESTING ERROR PAGE" #define TRANSFORM_APPEND_STRING "This is a transformed response" +extern int dns_failover_period; + ////////////////////////////////////////////////////////////////////////////// // STRUCTURES ////////////////////////////////////////////////////////////////////////////// @@ -8525,6 +8527,11 @@ EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectIntercept)(RegressionTest *test, EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectServerIntercept)(RegressionTest *test, int /* atype ATS_UNUSED */, int *pstatus) { *pstatus = REGRESSION_TEST_INPROGRESS; + // These tests don't actually require DNS resolution, but if there are issues + // connecting to the DNS server then the test will fail due to error events + // from the communication arising. We avoid this by simply extending the DNS + // failover period to a very large value. + dns_failover_period = 1000; TSDebug(UTDBG_TAG, "Starting test TSHttpConnectServerIntercept");