From 80d5abe780b834c540b19fc32117d0c323c52970 Mon Sep 17 00:00:00 2001 From: Keith Turner Date: Fri, 12 Apr 2024 20:08:21 -0400 Subject: [PATCH 1/2] WIP add reservation related metrics to scan server Scan servers differ from in tablet servers in that they can service scans for any tablet, but must read the tablets files and reserve those files to prevent the Accumulo GC from removing them. This happens the first time a tablet is read on a scan server and involves reads and writes to the metadata table. This PR is a start at adding metrics related to reservations so that is possible to have visibility into the impact of this process on scans. --- .../core/metrics/MetricsProducer.java | 3 ++ .../apache/accumulo/tserver/ScanServer.java | 31 ++++++++++++-- .../accumulo/tserver/ScanServerMetrics.java | 42 +++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 1a8d618ce46..1c616c1d8fb 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -674,6 +674,9 @@ public interface MetricsProducer { String METRICS_TSERVER_SCAN_RESULTS_BYTES = METRICS_TSERVER_PREFIX + "scan.results.bytes"; String METRICS_TSERVER_SCANNED_ENTRIES = METRICS_TSERVER_PREFIX + "scan.scanned.entries"; + String METRICS_SSERVER_PREFIX = "accumulo.sserver."; + String METRICS_SSERVER_REGISTRATION_TIMER = METRICS_SSERVER_PREFIX + "registration.timer"; + String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; String METRICS_THRIFT_IDLE = METRICS_THRIFT_PREFIX + "idle"; diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index 5559fb148b7..488bc153785 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -200,6 +200,7 @@ private TabletMetadataLoader(Ample ample) { private volatile boolean serverStopRequested = false; private ServiceLock scanServerLock; protected TabletServerScanMetrics scanMetrics; + private ScanServerMetrics scanServerMetrics; private ZooCache managerLockCache; @@ -242,6 +243,7 @@ public ScanServer(ScanServerOpts opts, String[] args) { LOG.warn( "Tablet metadata caching less than one minute, may cause excessive scans on metadata table."); } + // TODO Would like hit rate metrics for this cache, how can that be done? tabletMetadataCache = Caffeine.newBuilder().expireAfterWrite(cacheExpiration, TimeUnit.MILLISECONDS) .scheduler(Scheduler.systemScheduler()).build(tabletMetadataLoader); @@ -379,7 +381,8 @@ public void run() { LOG.error("Error initializing metrics, metrics will not be emitted.", e1); } scanMetrics = new TabletServerScanMetrics(); - MetricsUtil.initializeProducers(scanMetrics); + scanServerMetrics = new ScanServerMetrics(); + MetricsUtil.initializeProducers(scanMetrics, scanServerMetrics); // We need to set the compaction manager so that we don't get an NPE in CompactableImpl.close @@ -662,6 +665,18 @@ private Map reserveFilesInner(Collection ex } } + private ScanReservation reserveFilesInstrumented(Map> extents) + throws AccumuloException { + long start = System.nanoTime(); + try { + return reserveFiles(extents); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + + } + protected ScanReservation reserveFiles(Map> extents) throws AccumuloException { @@ -692,6 +707,16 @@ protected ScanReservation reserveFiles(Map> extents) return new ScanReservation(tabletsMetadata, myReservationId, failures); } + private ScanReservation reserveFilesInstrumented(long scanId) throws NoSuchScanIDException { + long start = System.nanoTime(); + try { + return reserveFiles(scanId); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + } + protected ScanReservation reserveFiles(long scanId) throws NoSuchScanIDException { var session = (ScanSession) sessionManager.getSession(scanId); if (session == null) { @@ -880,7 +905,7 @@ public InitialScan startScan(TInfo tinfo, TCredentials credentials, TKeyExtent t KeyExtent extent = getKeyExtent(textent); try (ScanReservation reservation = - reserveFiles(Map.of(extent, Collections.singletonList(range)))) { + reserveFilesInstrumented(Map.of(extent, Collections.singletonList(range)))) { if (reservation.getFailures().containsKey(textent)) { throw new NotServingTabletException(extent.toThrift()); @@ -938,7 +963,7 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, batch.put(extent, entry.getValue()); } - try (ScanReservation reservation = reserveFiles(batch)) { + try (ScanReservation reservation = reserveFilesInstrumented(batch)) { HashMap tablets = new HashMap<>(); reservation.getTabletMetadataExtents().forEach(extent -> { diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java new file mode 100644 index 00000000000..238ea9e4dbf --- /dev/null +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.tserver; + +import org.apache.accumulo.core.metrics.MetricsProducer; +import org.apache.accumulo.core.metrics.MetricsUtil; + +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; + +public class ScanServerMetrics implements MetricsProducer { + + private Timer reservationTimer; + + @Override + public void registerMetrics(MeterRegistry registry) { + reservationTimer = Timer.builder(MetricsProducer.METRICS_SSERVER_REGISTRATION_TIMER) + .description("Time to reserve a tablets files for scan").tags(MetricsUtil.getCommonTags()) + .register(registry); + } + + public Timer getReservationTimer() { + return reservationTimer; + } + +} From d8b4c66e22b9efcd252e122031fc4513345e6713 Mon Sep 17 00:00:00 2001 From: Keith Turner Date: Mon, 15 Apr 2024 11:22:51 -0400 Subject: [PATCH 2/2] add scan server busy timeout metrics --- .../accumulo/core/metrics/MetricsProducer.java | 1 + .../org/apache/accumulo/tserver/ScanServer.java | 14 +++++++++++++- .../apache/accumulo/tserver/ScanServerMetrics.java | 8 ++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 1c616c1d8fb..933ef85a432 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -676,6 +676,7 @@ public interface MetricsProducer { String METRICS_SSERVER_PREFIX = "accumulo.sserver."; String METRICS_SSERVER_REGISTRATION_TIMER = METRICS_SSERVER_PREFIX + "registration.timer"; + String METRICS_SSERVER_BUSY_COUNTER = METRICS_SSERVER_PREFIX + "busy.count"; String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index 488bc153785..b0734011bb9 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -77,6 +77,7 @@ import org.apache.accumulo.core.tabletserver.thrift.ActiveScan; import org.apache.accumulo.core.tabletserver.thrift.NoSuchScanIDException; import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException; +import org.apache.accumulo.core.tabletserver.thrift.ScanServerBusyException; import org.apache.accumulo.core.tabletserver.thrift.TSampleNotPresentException; import org.apache.accumulo.core.tabletserver.thrift.TSamplerConfiguration; import org.apache.accumulo.core.tabletserver.thrift.TabletScanClientService; @@ -919,7 +920,9 @@ batchTimeOut, classLoaderContext, executionHints, getScanTabletResolver(tablet), busyTimeout); return is; - + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (AccumuloException | IOException e) { LOG.error("Error starting scan", e); throw new RuntimeException(e); @@ -935,6 +938,9 @@ public ScanResult continueScan(TInfo tinfo, long scanID, long busyTimeout) try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } @@ -980,6 +986,9 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, LOG.trace("started scan: {}", ims.getScanID()); return ims; + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (TException e) { LOG.error("Error starting scan", e); throw e; @@ -997,6 +1006,9 @@ public MultiScanResult continueMultiScan(TInfo tinfo, long scanID, long busyTime try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueMultiScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java index 238ea9e4dbf..7438d5b1318 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -21,22 +21,30 @@ import org.apache.accumulo.core.metrics.MetricsProducer; import org.apache.accumulo.core.metrics.MetricsUtil; +import io.micrometer.core.instrument.Counter; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.Timer; public class ScanServerMetrics implements MetricsProducer { private Timer reservationTimer; + private Counter busyCounter; @Override public void registerMetrics(MeterRegistry registry) { reservationTimer = Timer.builder(MetricsProducer.METRICS_SSERVER_REGISTRATION_TIMER) .description("Time to reserve a tablets files for scan").tags(MetricsUtil.getCommonTags()) .register(registry); + busyCounter = Counter.builder(MetricsProducer.METRICS_SSERVER_BUSY_COUNTER) + .description("The number of scans where a busy timeout happened") + .tags(MetricsUtil.getCommonTags()).register(registry); } public Timer getReservationTimer() { return reservationTimer; } + public void incrementBusy() { + busyCounter.increment(); + } }