From ad34474367be7700d6d34fa4aa2c2b27adff3635 Mon Sep 17 00:00:00 2001 From: Ed Coleman Date: Wed, 24 Apr 2024 16:10:26 +0000 Subject: [PATCH 1/6] Add scan server metrics --- .../core/metrics/MetricsProducer.java | 27 +++++++ .../apache/accumulo/tserver/ScanServer.java | 45 ++++++++++-- .../accumulo/tserver/ScanServerMetrics.java | 70 +++++++++++++++++++ .../accumulo/tserver/ScanServerTest.java | 4 ++ .../accumulo/test/metrics/MetricsIT.java | 5 +- 5 files changed, 146 insertions(+), 5 deletions(-) create mode 100644 server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 568b295f580..917922d7892 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -367,6 +367,28 @@ * Gauge * * + * + * + * N/A + * N/A + * {@value #METRICS_SSERVER_REGISTRATION_TIMER} + * Timer + * Time to reserve a tablets files for scan + * + * + * N/A + * N/A + * {@value #METRICS_SSERVER_BUSY_COUNTER} + * Counter + * Count of the scans where a busy timeout happened + * + * + * N/A + * N/A + * {@value #METRICS_SSERVER_TABLET_METADATA_CACHE} + * Cache + * scan server tablet cache metrics + * * * * scan @@ -638,6 +660,11 @@ public interface MetricsProducer { String METRICS_TSERVER_SCAN_RESULTS_BYTES = METRICS_TSERVER_PREFIX + "scan.results.bytes"; String METRICS_TSERVER_SCANNED_ENTRIES = METRICS_TSERVER_PREFIX + "scan.scanned.entries"; + String METRICS_SSERVER_PREFIX = "accumulo.sserver."; + String METRICS_SSERVER_REGISTRATION_TIMER = METRICS_SSERVER_PREFIX + "registration.timer"; + String METRICS_SSERVER_BUSY_COUNTER = METRICS_SSERVER_PREFIX + "busy.count"; + String METRICS_SSERVER_TABLET_METADATA_CACHE = METRICS_SSERVER_PREFIX + "tablet.metadata.cache"; + String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; String METRICS_THRIFT_IDLE = METRICS_THRIFT_PREFIX + "idle"; diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index bcba26aa135..813245b07d2 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -76,6 +76,7 @@ import org.apache.accumulo.core.tabletserver.thrift.ActiveScan; import org.apache.accumulo.core.tabletserver.thrift.NoSuchScanIDException; import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException; +import org.apache.accumulo.core.tabletserver.thrift.ScanServerBusyException; import org.apache.accumulo.core.tabletserver.thrift.TSampleNotPresentException; import org.apache.accumulo.core.tabletserver.thrift.TSamplerConfiguration; import org.apache.accumulo.core.tabletserver.thrift.TabletScanClientService; @@ -199,6 +200,7 @@ private TabletMetadataLoader(Ample ample) { private volatile boolean serverStopRequested = false; private ServiceLock scanServerLock; protected TabletServerScanMetrics scanMetrics; + private ScanServerMetrics scanServerMetrics; private ZooCache managerLockCache; @@ -373,8 +375,9 @@ public void run() { metricsInfo.addServiceTags(getApplicationName(), clientAddress); scanMetrics = new TabletServerScanMetrics(); + scanServerMetrics = new ScanServerMetrics(tabletMetadataCache, groupName); - metricsInfo.addMetricsProducers(scanMetrics); + metricsInfo.addMetricsProducers(scanMetrics, scanServerMetrics); metricsInfo.init(); // We need to set the compaction manager so that we don't get an NPE in CompactableImpl.close @@ -657,6 +660,19 @@ private Map reserveFilesInner(Collection ex } } + @VisibleForTesting + ScanReservation reserveFilesInstrumented(Map> extents) + throws AccumuloException { + long start = System.nanoTime(); + try { + return reserveFiles(extents); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + + } + protected ScanReservation reserveFiles(Map> extents) throws AccumuloException { @@ -687,6 +703,16 @@ protected ScanReservation reserveFiles(Map> extents) return new ScanReservation(tabletsMetadata, myReservationId, failures); } + private ScanReservation reserveFilesInstrumented(long scanId) throws NoSuchScanIDException { + long start = System.nanoTime(); + try { + return reserveFiles(scanId); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + } + protected ScanReservation reserveFiles(long scanId) throws NoSuchScanIDException { var session = (ScanSession) sessionManager.getSession(scanId); if (session == null) { @@ -875,7 +901,7 @@ public InitialScan startScan(TInfo tinfo, TCredentials credentials, TKeyExtent t KeyExtent extent = getKeyExtent(textent); try (ScanReservation reservation = - reserveFiles(Map.of(extent, Collections.singletonList(range)))) { + reserveFilesInstrumented(Map.of(extent, Collections.singletonList(range)))) { if (reservation.getFailures().containsKey(textent)) { throw new NotServingTabletException(extent.toThrift()); @@ -889,7 +915,9 @@ batchTimeOut, classLoaderContext, executionHints, getScanTabletResolver(tablet), busyTimeout); return is; - + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (AccumuloException | IOException e) { LOG.error("Error starting scan", e); throw new RuntimeException(e); @@ -905,6 +933,9 @@ public ScanResult continueScan(TInfo tinfo, long scanID, long busyTimeout) try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } @@ -933,7 +964,7 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, batch.put(extent, entry.getValue()); } - try (ScanReservation reservation = reserveFiles(batch)) { + try (ScanReservation reservation = reserveFilesInstrumented(batch)) { HashMap tablets = new HashMap<>(); reservation.getTabletMetadataExtents().forEach(extent -> { @@ -950,6 +981,9 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, LOG.trace("started scan: {}", ims.getScanID()); return ims; + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (TException e) { LOG.error("Error starting scan", e); throw e; @@ -967,6 +1001,9 @@ public MultiScanResult continueMultiScan(TInfo tinfo, long scanID, long busyTime try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueMultiScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java new file mode 100644 index 00000000000..38e52a64b21 --- /dev/null +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.tserver; + +import java.util.List; + +import org.apache.accumulo.core.dataImpl.KeyExtent; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.metrics.MetricsProducer; + +import com.github.benmanes.caffeine.cache.LoadingCache; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Tag; +import io.micrometer.core.instrument.Timer; +import io.micrometer.core.instrument.binder.cache.CaffeineCacheMetrics; + +public class ScanServerMetrics implements MetricsProducer { + + private final String resourceGroup; + private Timer reservationTimer; + private Counter busyCounter; + + private final LoadingCache tabletMetadataCache; + + public ScanServerMetrics(final LoadingCache tabletMetadataCache, + String resourceGroup) { + this.tabletMetadataCache = tabletMetadataCache; + this.resourceGroup = resourceGroup; + } + + @Override + public void registerMetrics(MeterRegistry registry) { + reservationTimer = Timer.builder(MetricsProducer.METRICS_SSERVER_REGISTRATION_TIMER) + .description("Time to reserve a tablets files for scan") + .tag("resource.group", resourceGroup).register(registry); + // TODO this counter is a duplicate, already a metrics for this below the scan server... use + // that instead, but look into renaming existing one to indicate scan server + busyCounter = Counter.builder(MetricsProducer.METRICS_SSERVER_BUSY_COUNTER) + .tag("resource.group", resourceGroup) + .description("The number of scans where a busy timeout happened").register(registry); + CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, + METRICS_SSERVER_TABLET_METADATA_CACHE, List.of(Tag.of("resource.group", resourceGroup))); + } + + public Timer getReservationTimer() { + return reservationTimer; + } + + public void incrementBusy() { + busyCounter.increment(); + } +} diff --git a/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java b/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java index fdb79e1b00f..b7f64e05241 100644 --- a/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java +++ b/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java @@ -101,6 +101,10 @@ protected ScanReservation reserveFiles(long scanId) throws NoSuchScanIDException return reservation; } + @Override + ScanReservation reserveFilesInstrumented(Map> extents) { + return reservation; + } } private ThriftScanClientHandler handler; diff --git a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java index 4c8c3bea26a..0ec840dcd6f 100644 --- a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java +++ b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java @@ -101,7 +101,10 @@ public void confirmMetricsPublished() throws Exception { Set unexpectedMetrics = Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT); - Set flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS); + // add sserver as flaky until scan server included in mini tests. + Set flakyMetrics = + Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, METRICS_SSERVER_BUSY_COUNTER, + METRICS_SSERVER_REGISTRATION_TIMER, METRICS_SSERVER_TABLET_METADATA_CACHE); Map expectedMetricNames = this.getMetricFields(); flakyMetrics.forEach(expectedMetricNames::remove); // might not see these From 5dd3c2d311255a8de8949178a0343e5d074e6696 Mon Sep 17 00:00:00 2001 From: Ed Coleman Date: Wed, 1 May 2024 12:58:14 +0000 Subject: [PATCH 2/6] scan metrics renaming to remove specific service in favor of tags --- .../core/metrics/MetricsProducer.java | 27 +++++++------------ .../accumulo/tserver/ScanServerMetrics.java | 18 ++++++------- .../tserver/ThriftScanClientHandler.java | 4 +-- .../metrics/TabletServerScanMetrics.java | 10 +++---- .../accumulo/test/metrics/MetricsIT.java | 8 +++--- 5 files changed, 28 insertions(+), 39 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 917922d7892..98bf3cdeb3f 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -371,21 +371,21 @@ * * N/A * N/A - * {@value #METRICS_SSERVER_REGISTRATION_TIMER} + * {@value #METRICS_SCAN_RESERVATION_TIMER} * Timer * Time to reserve a tablets files for scan * * * N/A * N/A - * {@value #METRICS_SSERVER_BUSY_COUNTER} + * {@value #METRICS_SCAN_BUSY_TIMEOUT_COUNTER} * Counter * Count of the scans where a busy timeout happened * * * N/A * N/A - * {@value #METRICS_SSERVER_TABLET_METADATA_CACHE} + * {@value #METRICS_SCAN_TABLET_METADATA_CACHE} * Cache * scan server tablet cache metrics * @@ -439,13 +439,6 @@ * Counter * * - * - * N/A - * N/A - * {@value #METRICS_SCAN_BUSY_TIMEOUT} - * Counter - * - * * * * {i|e}_{compactionServiceName}_{executor_name}_queued @@ -627,7 +620,7 @@ public interface MetricsProducer { String METRICS_REPLICATION_PEERS = METRICS_REPLICATION_PREFIX + "peers"; String METRICS_REPLICATION_THREADS = METRICS_REPLICATION_PREFIX + "threads"; - String METRICS_SCAN_PREFIX = "accumulo.tserver.scans."; + String METRICS_SCAN_PREFIX = "accumulo.scan."; String METRICS_SCAN_TIMES = METRICS_SCAN_PREFIX + "times"; String METRICS_SCAN_OPEN_FILES = METRICS_SCAN_PREFIX + "files.open"; String METRICS_SCAN_RESULTS = METRICS_SCAN_PREFIX + "result"; @@ -635,7 +628,10 @@ public interface MetricsProducer { String METRICS_SCAN_START = METRICS_SCAN_PREFIX + "start"; String METRICS_SCAN_CONTINUE = METRICS_SCAN_PREFIX + "continue"; String METRICS_SCAN_CLOSE = METRICS_SCAN_PREFIX + "close"; - String METRICS_SCAN_BUSY_TIMEOUT = METRICS_SCAN_PREFIX + "busy.timeout"; + String METRICS_SCAN_BUSY_TIMEOUT_COUNTER = METRICS_SCAN_PREFIX + "busy.timeout.count"; + String METRICS_SCAN_RESERVATION_TIMER = METRICS_SCAN_PREFIX + "reservation.timer"; + + String METRICS_SCAN_TABLET_METADATA_CACHE = METRICS_SCAN_PREFIX + "tablet.metadata.cache"; String METRICS_TSERVER_PREFIX = "accumulo.tserver."; String METRICS_TSERVER_ENTRIES = METRICS_TSERVER_PREFIX + "entries"; @@ -660,11 +656,6 @@ public interface MetricsProducer { String METRICS_TSERVER_SCAN_RESULTS_BYTES = METRICS_TSERVER_PREFIX + "scan.results.bytes"; String METRICS_TSERVER_SCANNED_ENTRIES = METRICS_TSERVER_PREFIX + "scan.scanned.entries"; - String METRICS_SSERVER_PREFIX = "accumulo.sserver."; - String METRICS_SSERVER_REGISTRATION_TIMER = METRICS_SSERVER_PREFIX + "registration.timer"; - String METRICS_SSERVER_BUSY_COUNTER = METRICS_SSERVER_PREFIX + "busy.count"; - String METRICS_SSERVER_TABLET_METADATA_CACHE = METRICS_SSERVER_PREFIX + "tablet.metadata.cache"; - String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; String METRICS_THRIFT_IDLE = METRICS_THRIFT_PREFIX + "idle"; @@ -696,7 +687,7 @@ default Map getMetricFields() { fields.put((String) f.get(MetricsProducer.class), f.getName()); } catch (IllegalArgumentException | IllegalAccessException e) { // this shouldn't happen, but let's log it anyway - LOG.error("Error getting metric value for field: " + f.getName()); + LOG.error("Error getting metric value for field: {}", f.getName()); } } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java index 38e52a64b21..c476d46281f 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -36,7 +36,7 @@ public class ScanServerMetrics implements MetricsProducer { private final String resourceGroup; private Timer reservationTimer; - private Counter busyCounter; + private Counter busyTimeoutCount; private final LoadingCache tabletMetadataCache; @@ -48,16 +48,14 @@ public ScanServerMetrics(final LoadingCache tabletMeta @Override public void registerMetrics(MeterRegistry registry) { - reservationTimer = Timer.builder(MetricsProducer.METRICS_SSERVER_REGISTRATION_TIMER) + reservationTimer = Timer.builder(MetricsProducer.METRICS_SCAN_RESERVATION_TIMER) .description("Time to reserve a tablets files for scan") .tag("resource.group", resourceGroup).register(registry); - // TODO this counter is a duplicate, already a metrics for this below the scan server... use - // that instead, but look into renaming existing one to indicate scan server - busyCounter = Counter.builder(MetricsProducer.METRICS_SSERVER_BUSY_COUNTER) - .tag("resource.group", resourceGroup) - .description("The number of scans where a busy timeout happened").register(registry); - CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, - METRICS_SSERVER_TABLET_METADATA_CACHE, List.of(Tag.of("resource.group", resourceGroup))); + busyTimeoutCount = + Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER).tag("resource.group", resourceGroup) + .description("The number of scans where a busy timeout happened").register(registry); + CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, METRICS_SCAN_TABLET_METADATA_CACHE, + List.of(Tag.of("resource.group", resourceGroup))); } public Timer getReservationTimer() { @@ -65,6 +63,6 @@ public Timer getReservationTimer() { } public void incrementBusy() { - busyCounter.increment(); + busyTimeoutCount.increment(); } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java index d1ff17347ae..8a99b2315e4 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java @@ -284,7 +284,7 @@ protected ScanResult continueScan(TInfo tinfo, long scanID, SingleScanSession sc server.getSessionManager().removeSession(scanID); TabletBase tablet = scanSession.getTabletResolver().getTablet(scanSession.extent); if (busyTimeout > 0) { - server.getScanMetrics().incrementScanBusyTimeout(1.0D); + server.getScanMetrics().incrementBusy(1.0D); throw new ScanServerBusyException(); } else if (tablet == null || tablet.isClosed()) { throw new NotServingTabletException(scanSession.extent.toThrift()); @@ -495,7 +495,7 @@ private MultiScanResult continueMultiScan(long scanID, MultiScanSession session, } catch (CancellationException ce) { server.getSessionManager().removeSession(scanID); if (busyTimeout > 0) { - server.getScanMetrics().incrementScanBusyTimeout(1.0D); + server.getScanMetrics().incrementBusy(1.0D); throw new ScanServerBusyException(); } else { log.warn("Failed to get multiscan result", ce); diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java index 492a6a8c803..ce845d555d2 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java @@ -39,7 +39,7 @@ public class TabletServerScanMetrics implements MetricsProducer { private Counter startScanCalls; private Counter continueScanCalls; private Counter closeScanCalls; - private Counter busyTimeoutReturned; + private Counter busyTimeoutCount; private final LongAdder lookupCount = new LongAdder(); private final LongAdder queryResultCount = new LongAdder(); @@ -114,8 +114,8 @@ public void incrementCloseScan(double value) { closeScanCalls.increment(value); } - public void incrementScanBusyTimeout(double value) { - busyTimeoutReturned.increment(value); + public void incrementBusy(double value) { + busyTimeoutCount.increment(value); } @Override @@ -133,8 +133,8 @@ public void registerMetrics(MeterRegistry registry) { .description("calls to continue a scan / multiscan").register(registry); closeScanCalls = Counter.builder(METRICS_SCAN_CLOSE) .description("calls to close a scan / multiscan").register(registry); - busyTimeoutReturned = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT) - .description("times that a scan has timed out in the queue").register(registry); + busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) + .description("The number of scans where a busy timeout happened").register(registry); Gauge.builder(METRICS_TSERVER_QUERIES, this, TabletServerScanMetrics::getLookupCount) .description("Number of queries").register(registry); Gauge.builder(METRICS_TSERVER_SCAN_RESULTS, this, TabletServerScanMetrics::getQueryResultCount) diff --git a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java index 0ec840dcd6f..55622d07937 100644 --- a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java +++ b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java @@ -100,11 +100,11 @@ public void confirmMetricsPublished() throws Exception { cluster.stop(); Set unexpectedMetrics = Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, - METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT); + METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT_COUNTER); // add sserver as flaky until scan server included in mini tests. - Set flakyMetrics = - Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, METRICS_SSERVER_BUSY_COUNTER, - METRICS_SSERVER_REGISTRATION_TIMER, METRICS_SSERVER_TABLET_METADATA_CACHE); + Set flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, + METRICS_SCAN_BUSY_TIMEOUT_COUNTER, METRICS_SCAN_RESERVATION_TIMER, + METRICS_SCAN_TABLET_METADATA_CACHE); Map expectedMetricNames = this.getMetricFields(); flakyMetrics.forEach(expectedMetricNames::remove); // might not see these From 3660ecd0f7628c22300edf4ea5d1302fe00a21a0 Mon Sep 17 00:00:00 2001 From: Keith Turner Date: Thu, 2 May 2024 23:04:13 +0000 Subject: [PATCH 3/6] enable stats on scan sever tablet metadata cache --- .../src/main/java/org/apache/accumulo/tserver/ScanServer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index 813245b07d2..aab4688dcfe 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -245,7 +245,7 @@ public ScanServer(ScanServerOpts opts, String[] args) { } tabletMetadataCache = Caffeine.newBuilder().expireAfterWrite(cacheExpiration, TimeUnit.MILLISECONDS) - .scheduler(Scheduler.systemScheduler()).build(tabletMetadataLoader); + .scheduler(Scheduler.systemScheduler()).recordStats().build(tabletMetadataLoader); } delegate = newThriftScanClientHandler(new WriteTracker()); From a83ad55e1cc98939eccdfcd84a2b4463d23c8d6d Mon Sep 17 00:00:00 2001 From: Ed Coleman Date: Thu, 2 May 2024 23:19:31 +0000 Subject: [PATCH 4/6] add resource.group to sserver common tags --- .../src/main/java/org/apache/accumulo/tserver/ScanServer.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index aab4688dcfe..9238e604fa5 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -123,6 +123,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import io.micrometer.core.instrument.Tag; + public class ScanServer extends AbstractServer implements TabletScanClientService.Iface, TabletHostingServer { @@ -340,6 +342,7 @@ public void unableToMonitorLockNode(final Exception e) { // Don't use the normal ServerServices lock content, instead put the server UUID here. byte[] lockContent = (serverLockUUID.toString() + "," + groupName).getBytes(UTF_8); + // wait for 120 seconds with 5 second delay for (int i = 0; i < 120 / 5; i++) { zoo.putPersistentData(zLockPath.toString(), new byte[0], NodeExistsPolicy.SKIP); @@ -373,6 +376,7 @@ public void run() { MetricsInfo metricsInfo = getContext().getMetricsInfo(); metricsInfo.addServiceTags(getApplicationName(), clientAddress); + metricsInfo.addCommonTags(List.of(Tag.of("resource.group", groupName))); scanMetrics = new TabletServerScanMetrics(); scanServerMetrics = new ScanServerMetrics(tabletMetadataCache, groupName); From 9f5debb8d7d9eae5e4965c73bc8c4c61e04ffd8f Mon Sep 17 00:00:00 2001 From: Ed Coleman Date: Thu, 2 May 2024 23:50:54 +0000 Subject: [PATCH 5/6] remove extra resource.group tag when its added as common tag --- .../apache/accumulo/tserver/ScanServer.java | 2 +- .../accumulo/tserver/ScanServerMetrics.java | 19 +++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index 9238e604fa5..94186ed38f3 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -379,7 +379,7 @@ public void run() { metricsInfo.addCommonTags(List.of(Tag.of("resource.group", groupName))); scanMetrics = new TabletServerScanMetrics(); - scanServerMetrics = new ScanServerMetrics(tabletMetadataCache, groupName); + scanServerMetrics = new ScanServerMetrics(tabletMetadataCache); metricsInfo.addMetricsProducers(scanMetrics, scanServerMetrics); metricsInfo.init(); diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java index c476d46281f..1a516b597bb 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -18,8 +18,6 @@ */ package org.apache.accumulo.tserver; -import java.util.List; - import org.apache.accumulo.core.dataImpl.KeyExtent; import org.apache.accumulo.core.metadata.schema.TabletMetadata; import org.apache.accumulo.core.metrics.MetricsProducer; @@ -28,34 +26,27 @@ import io.micrometer.core.instrument.Counter; import io.micrometer.core.instrument.MeterRegistry; -import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; import io.micrometer.core.instrument.binder.cache.CaffeineCacheMetrics; public class ScanServerMetrics implements MetricsProducer { - private final String resourceGroup; private Timer reservationTimer; private Counter busyTimeoutCount; private final LoadingCache tabletMetadataCache; - public ScanServerMetrics(final LoadingCache tabletMetadataCache, - String resourceGroup) { + public ScanServerMetrics(final LoadingCache tabletMetadataCache) { this.tabletMetadataCache = tabletMetadataCache; - this.resourceGroup = resourceGroup; } @Override public void registerMetrics(MeterRegistry registry) { reservationTimer = Timer.builder(MetricsProducer.METRICS_SCAN_RESERVATION_TIMER) - .description("Time to reserve a tablets files for scan") - .tag("resource.group", resourceGroup).register(registry); - busyTimeoutCount = - Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER).tag("resource.group", resourceGroup) - .description("The number of scans where a busy timeout happened").register(registry); - CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, METRICS_SCAN_TABLET_METADATA_CACHE, - List.of(Tag.of("resource.group", resourceGroup))); + .description("Time to reserve a tablets files for scan").register(registry); + busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) + .description("The number of scans where a busy timeout happened").register(registry); + CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, METRICS_SCAN_TABLET_METADATA_CACHE); } public Timer getReservationTimer() { From a8235a82b8f5be1894f0cdff77c28f7ee1d3a455 Mon Sep 17 00:00:00 2001 From: Ed Coleman Date: Fri, 3 May 2024 01:22:57 +0000 Subject: [PATCH 6/6] addition renames to remove tserver from scan metrics --- .../core/metrics/MetricsProducer.java | 70 +++++++++---------- .../metrics/TabletServerScanMetrics.java | 10 +-- 2 files changed, 41 insertions(+), 39 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 98bf3cdeb3f..1bb2a1c10e4 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -314,37 +314,6 @@ * * * - * queries - * Gauge - * {@value #METRICS_TSERVER_QUERIES} - * Gauge - * - * - * - * scannedRate - * Gauge - * {@value #METRICS_TSERVER_SCANNED_ENTRIES} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * - * queryRate - * Gauge - * {@value #METRICS_TSERVER_SCAN_RESULTS} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * - * queryByteRate - * Gauge - * {@value #METRICS_TSERVER_SCAN_RESULTS_BYTES} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * * ingestRate * Gauge * {@value #METRICS_TSERVER_INGEST_MUTATIONS} @@ -439,6 +408,37 @@ * Counter * * + * + * queries + * Gauge + * {@value #METRICS_SCAN_QUERIES} + * Gauge + * + * + * + * scannedRate + * Gauge + * {@value #METRICS_SCAN_SCANNED_ENTRIES} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * + * + * queryRate + * Gauge + * {@value #METRICS_SCAN_QUERY_SCAN_RESULTS} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * + * + * queryByteRate + * Gauge + * {@value #METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * * * * {i|e}_{compactionServiceName}_{executor_name}_queued @@ -630,6 +630,10 @@ public interface MetricsProducer { String METRICS_SCAN_CLOSE = METRICS_SCAN_PREFIX + "close"; String METRICS_SCAN_BUSY_TIMEOUT_COUNTER = METRICS_SCAN_PREFIX + "busy.timeout.count"; String METRICS_SCAN_RESERVATION_TIMER = METRICS_SCAN_PREFIX + "reservation.timer"; + String METRICS_SCAN_QUERIES = METRICS_SCAN_PREFIX + "queries"; + String METRICS_SCAN_QUERY_SCAN_RESULTS = METRICS_SCAN_PREFIX + "query.results"; + String METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES = METRICS_SCAN_PREFIX + "query.results.bytes"; + String METRICS_SCAN_SCANNED_ENTRIES = METRICS_SCAN_PREFIX + "query.scanned.entries"; String METRICS_SCAN_TABLET_METADATA_CACHE = METRICS_SCAN_PREFIX + "tablet.metadata.cache"; @@ -647,14 +651,10 @@ public interface MetricsProducer { String METRICS_TSERVER_TABLETS_ONLINE = METRICS_TSERVER_PREFIX + "tablets.online"; String METRICS_TSERVER_TABLETS_OPENING = METRICS_TSERVER_PREFIX + "tablets.opening"; String METRICS_TSERVER_TABLETS_UNOPENED = METRICS_TSERVER_PREFIX + "tablets.unopened"; - String METRICS_TSERVER_QUERIES = METRICS_TSERVER_PREFIX + "queries"; String METRICS_TSERVER_TABLETS_FILES = METRICS_TSERVER_PREFIX + "tablets.files"; String METRICS_TSERVER_HOLD = METRICS_TSERVER_PREFIX + "hold"; String METRICS_TSERVER_INGEST_MUTATIONS = METRICS_TSERVER_PREFIX + "ingest.mutations"; String METRICS_TSERVER_INGEST_BYTES = METRICS_TSERVER_PREFIX + "ingest.bytes"; - String METRICS_TSERVER_SCAN_RESULTS = METRICS_TSERVER_PREFIX + "scan.results"; - String METRICS_TSERVER_SCAN_RESULTS_BYTES = METRICS_TSERVER_PREFIX + "scan.results.bytes"; - String METRICS_TSERVER_SCANNED_ENTRIES = METRICS_TSERVER_PREFIX + "scan.scanned.entries"; String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java index ce845d555d2..8e066dd7f71 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java @@ -135,15 +135,17 @@ public void registerMetrics(MeterRegistry registry) { .description("calls to close a scan / multiscan").register(registry); busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) .description("The number of scans where a busy timeout happened").register(registry); - Gauge.builder(METRICS_TSERVER_QUERIES, this, TabletServerScanMetrics::getLookupCount) + Gauge.builder(METRICS_SCAN_QUERIES, this, TabletServerScanMetrics::getLookupCount) .description("Number of queries").register(registry); - Gauge.builder(METRICS_TSERVER_SCAN_RESULTS, this, TabletServerScanMetrics::getQueryResultCount) + Gauge + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS, this, + TabletServerScanMetrics::getQueryResultCount) .description("Query rate (entries/sec)").register(registry); Gauge - .builder(METRICS_TSERVER_SCAN_RESULTS_BYTES, this, + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES, this, TabletServerScanMetrics::getQueryByteCount) .description("Query rate (bytes/sec)").register(registry); - Gauge.builder(METRICS_TSERVER_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) + Gauge.builder(METRICS_SCAN_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) .description("Scanned rate").register(registry); }