diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc index e7e9ebd2bbd..9ad2ea32285 100644 --- a/iocore/dns/DNS.cc +++ b/iocore/dns/DNS.cc @@ -524,12 +524,12 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc } return false; } else { - ns_down[icon] = 0; if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) { Error("[iocore_dns] open_con: Failed to add %d server to epoll list\n", icon); } else { - cur_con.num = icon; - Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon); + cur_con.num = icon; + ns_down[icon] = 0; + Debug("dns", "opening connection %s on fd %d SUCCEEDED for %d", ip_text, cur_con.fd, icon); } ret = true; } @@ -729,6 +729,15 @@ void DNSHandler::failover() { Debug("dns", "failover: initiating failover attempt, current name_server=%d", name_server); + if (!ns_down[name_server]) { + ip_text_buffer buff; + // mark this nameserver as down + Debug("dns", "failover: Marking nameserver %d as down", name_server); + ns_down[name_server] = 1; + Warning("connection to DNS server %s lost, marking as down", + ats_ip_ntop(&m_res->nsaddr_list[name_server].sa, buff, sizeof(buff))); + } + // no hope, if we have only one server if (m_res->nscount > 1) { ip_text_buffer buff1, buff2; @@ -766,6 +775,8 @@ DNSHandler::failover() ip_text_buffer buff; Warning("failover: connection to DNS server %s lost, retrying", ats_ip_ntop(&ip.sa, buff, sizeof(buff))); } + // Make sure retries are done even if no more requests. + this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD); } /** Mark one of the nameservers as down. */ @@ -779,6 +790,8 @@ DNSHandler::rr_failure(int ndx) Debug("dns", "rr_failure: Marking nameserver %d as down", ndx); ns_down[ndx] = 1; Warning("connection to DNS server %s lost, marking as down", ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff))); + // Make sure retries are done even if no more requests. + this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD); } int nscount = m_res->nscount; @@ -971,6 +984,11 @@ DNSHandler::check_and_reset_tcp_conn() int DNSHandler::mainEvent(int event, Event *e) { + // If this was a scheduled retry event, clear the associated flag. + if (e && e->cookie == RETRY_COOKIE) { + this->nameserver_retry_in_flight_p = false; + } + recv_dns(event, e); if (dns_ns_rr) { if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) { @@ -1017,10 +1035,6 @@ DNSHandler::mainEvent(int event, Event *e) write_dns(this); } - if (std::any_of(ns_down, ns_down + n_con, [](int f) { return f != 0; })) { - this_ethread()->schedule_at(this, DNS_PRIMARY_RETRY_PERIOD); - } - return EVENT_CONT; } diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h index 4c16f32f993..7d0bba98d2c 100644 --- a/iocore/dns/P_DNSProcessor.h +++ b/iocore/dns/P_DNSProcessor.h @@ -169,9 +169,16 @@ struct DNSHandler : public Continuation { DNSConnection udpcon[MAX_NAMED]; Queue entries; Queue triggered; - int in_flight = 0; - int name_server = 0; - int in_write_dns = 0; + int in_flight = 0; + int name_server = 0; + int in_write_dns = 0; + /// Rate limiter for down nameserver retries. + /// Don't schedule another if there is already one in flight. + std::atomic nameserver_retry_in_flight_p{false}; + /// Marker for event cookie to indicate it's a nameserver retry event. + /// @note Can't be @c constexpr because of the cast. + static inline void *const RETRY_COOKIE{reinterpret_cast(0x2)}; + HostEnt *hostent_cache = nullptr; int ns_down[MAX_NAMED]; @@ -212,16 +219,16 @@ struct DNSHandler : public Continuation { (ink_hrtime)HRTIME_SECONDS(dns_failover_period)); Debug("dns", "\tdelta time is %" PRId64 "", (Thread::get_hrtime() - crossed_failover_number[i])); } - return (crossed_failover_number[i] && - ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period))); + return ns_down[i] || (crossed_failover_number[i] && + ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period))); } bool failover_soon(int i) { - return (crossed_failover_number[i] && - ((Thread::get_hrtime() - crossed_failover_number[i]) > - (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY)))); + return ns_down[i] || (crossed_failover_number[i] && + ((Thread::get_hrtime() - crossed_failover_number[i]) > + (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY)))); } void recv_dns(int event, Event *e); diff --git a/src/traffic_server/InkAPITest.cc b/src/traffic_server/InkAPITest.cc index cd000967bbf..21d61abfa50 100644 --- a/src/traffic_server/InkAPITest.cc +++ b/src/traffic_server/InkAPITest.cc @@ -83,6 +83,8 @@ #define ERROR_BODY "TESTING ERROR PAGE" #define TRANSFORM_APPEND_STRING "This is a transformed response" +extern int dns_failover_period; + ////////////////////////////////////////////////////////////////////////////// // STRUCTURES ////////////////////////////////////////////////////////////////////////////// @@ -8525,6 +8527,11 @@ EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectIntercept)(RegressionTest *test, EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectServerIntercept)(RegressionTest *test, int /* atype ATS_UNUSED */, int *pstatus) { *pstatus = REGRESSION_TEST_INPROGRESS; + // These tests don't actually require DNS resolution, but if there are issues + // connecting to the DNS server then the test will fail due to error events + // from the communication arising. We avoid this by simply extending the DNS + // failover period to a very large value. + dns_failover_period = 1000; TSDebug(UTDBG_TAG, "Starting test TSHttpConnectServerIntercept");