Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
9b80f91
Add SR set change and empty block detection
warku123 Dec 25, 2025
1d2de1a
Add nextMaintenanceTime judge
warku123 Jan 9, 2026
1c0c1b7
Empty Block check unit test done
warku123 Jan 16, 2026
bc87e2e
SR change unit test
warku123 Jan 16, 2026
6bece43
feat(metrics): add block transaction count histogram for empty block …
warku123 Mar 24, 2026
39dd708
Modify blank to passed the ci check
warku123 Apr 1, 2026
e92b8e2
refactor(metrics): extract MINER_LABEL constant for histogram labels
warku123 Apr 2, 2026
846e3d3
refactor(metrics): address PR review feedback
warku123 Apr 8, 2026
ff17a19
fix(metrics): wrap long line to pass checkstyle 100-char limit
warku123 Apr 8, 2026
7f351bf
refactor(metrics): reuse txCount variable to avoid repeated calls
warku123 Apr 9, 2026
7fdb3c5
fix(metrics): use HashSet and add fork rollback guard for SR set change
warku123 Apr 14, 2026
7791944
perf(metrics): skip metrics computation when prometheus is disabled
warku123 Apr 16, 2026
a12378d
refactor(metrics): move SR set change recording into MaintenanceManager
warku123 Apr 27, 2026
96bf315
docs(metrics): add cardinality comment for SR_SET_CHANGE witness label
warku123 Apr 27, 2026
033019c
refactor(metrics): rename SrSetChangeMetric to SRMetrics for extensib…
warku123 Apr 28, 2026
4ca7d6e
refactor(metrics): tighten metric placement and rename
warku123 Apr 28, 2026
d9aab3e
Merge remote-tracking branch 'upstream/develop' into metric_modify_clean
warku123 Apr 28, 2026
8ef3e91
refactor(metrics): move BLOCK_TRANSACTION_COUNT to pushBlock entry
warku123 Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ public static class Counter {
public static final String TXS = "tron:txs";
public static final String MINER = "tron:miner";
public static final String BLOCK_FORK = "tron:block_fork";
// witness label: bounded cardinality -- SR candidate pool is finite, rotation is
// infrequent (at most once per maintenance interval); kept for at-a-glance SR
// identification in dashboards rather than requiring log cross-referencing.
public static final String SR_SET_CHANGE = "tron:sr_set_change";
Comment thread
Sunny6889 marked this conversation as resolved.
public static final String P2P_ERROR = "tron:p2p_error";
public static final String P2P_DISCONNECT = "tron:p2p_disconnect";
public static final String INTERNAL_SERVICE_FAIL = "tron:internal_service_fail";
Expand Down Expand Up @@ -62,6 +66,7 @@ public static class Histogram {
public static final String MESSAGE_PROCESS_LATENCY = "tron:message_process_latency_seconds";
public static final String BLOCK_FETCH_LATENCY = "tron:block_fetch_latency_seconds";
public static final String BLOCK_RECEIVE_DELAY = "tron:block_receive_delay_seconds";
public static final String BLOCK_TRANSACTION_COUNT = "tron:block_transaction_count";

private Histogram() {
throw new IllegalStateException("Histogram");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public static class Counter {
public static final String TXS_FAIL_SIG = "sig";
public static final String TXS_FAIL_TAPOS = "tapos";
public static final String TXS_FAIL_DUP = "dup";
public static final String SR_ADD = "add";
public static final String SR_REMOVE = "remove";

private Counter() {
throw new IllegalStateException("Counter");
Expand Down Expand Up @@ -66,6 +68,7 @@ private Gauge() {

// Histogram
public static class Histogram {
public static final String MINER = "miner";
public static final String TRAFFIC_IN = "in";
public static final String TRAFFIC_OUT = "out";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class MetricsCounter {
init(MetricKeys.Counter.TXS, "tron txs info .", "type", "detail");
init(MetricKeys.Counter.MINER, "tron miner info .", "miner", "type");
init(MetricKeys.Counter.BLOCK_FORK, "tron block fork info .", "type");
init(MetricKeys.Counter.SR_SET_CHANGE, "tron sr set change .", "action", "witness");
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The witness label uses addresses. Although the Active SR pool is relatively limited (27 slots), as SRs rotate in and out over time, label values will accumulate monotonically — they only grow, never shrink. This is a known Prometheus anti-pattern.

Practical impact assessment:

  • TRON's SR candidate pool is not infinite, and rotation frequency is low, so it won't explode in the short term

  • However, as a long-running node metric, months or years of operation could still accumulate a significant number of time series

Possible approaches:

  • If the address dimension is truly needed, keep the current design, but document this characteristic

  • Or remove the witness label entirely, keeping only action (add/remove), and output address details via logs instead. Follow the principle that Prometheus should be used for alerting and awareness, while logs are for precise investigation and root-cause analysis.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for raising the high-cardinality concern — it's a valid Prometheus best practice to watch for.

However, I believe the witness label is acceptable here given TRON's specific constraints:

  1. Bounded candidate pool: The SR candidate pool is finite. Even over years of operation, the total number of distinct addresses that have ever held an SR slot has a practical ceiling — far from the
    unbounded growth typical of high-cardinality anti-patterns.
  2. Low rotation frequency: SR set changes only occur at maintenance intervals (every 6 hours), so new label values are introduced very infrequently.
  3. Diagnostic value: The primary purpose of this metric is to identify which SR was added or removed. Dropping the witness label would reduce the metric to just "something changed", requiring operators to
    cross-reference logs — losing the at-a-glance visibility in Grafana dashboards that makes this metric useful in the first place.

In summary, the worst-case time series count is roughly candidate_pool_size × 2 (add/remove), which stays well within Prometheus's comfortable operating range.

That said, I agree it's worth documenting this characteristic. Could you suggest what form you'd prefer — a code comment at the metric registration site, a note in the PR description, or something else?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefer to a code comment at the metric registration site.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@halibobo1205 Added a comment at the SR_SET_CHANGE constant definition in MetricKeys.java (commit 96bf315) explaining the cardinality rationale:

witness label: bounded cardinality -- SR candidate pool is finite, rotation is infrequent (at most once per maintenance interval); kept for at-a-glance SR identification in dashboards rather than requiring log cross-referencing.

init(MetricKeys.Counter.P2P_ERROR, "tron p2p error info .", "type");
init(MetricKeys.Counter.P2P_DISCONNECT, "tron p2p disconnect .", "type");
init(MetricKeys.Counter.INTERNAL_SERVICE_FAIL, "internal Service fail.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.JSONRPC_SERVICE_LATENCY, "JsonRpc Service latency.",
"method");
init(MetricKeys.Histogram.MINER_LATENCY, "miner latency.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.PING_PONG_LATENCY, "node ping pong latency.");
init(MetricKeys.Histogram.VERIFY_SIGN_LATENCY, "verify sign latency for trx , block.",
"type");
Expand All @@ -36,7 +36,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.PROCESS_TRANSACTION_LATENCY, "process transaction latency.",
"type", "contract");
init(MetricKeys.Histogram.MINER_DELAY, "miner delay time, actualTime - planTime.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.UDP_BYTES, "udp_bytes traffic.",
"type");
init(MetricKeys.Histogram.TCP_BYTES, "tcp_bytes traffic.",
Expand All @@ -48,6 +48,11 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.BLOCK_FETCH_LATENCY, "fetch block latency.");
init(MetricKeys.Histogram.BLOCK_RECEIVE_DELAY,
"receive block delay time, receiveTime - blockTime.");

init(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT,
"Distribution of transaction counts per block.",
new double[]{0, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
MetricLabels.Histogram.MINER);
}

private MetricsHistogram() {
Expand All @@ -62,6 +67,17 @@ private static void init(String name, String help, String... labels) {
.register());
}

private static void init(String name, String help, double[] buckets, String... labels) {
Histogram.Builder builder = Histogram.build()
.name(name)
.help(help)
.labelNames(labels);
if (buckets != null && buckets.length > 0) {
builder.buckets(buckets);
}
container.put(name, builder.register());
}

static Histogram.Timer startTimer(String key, String... labels) {
if (Metrics.enabled()) {
Histogram histogram = container.get(key);
Expand Down
26 changes: 26 additions & 0 deletions common/src/main/java/org/tron/common/prometheus/SRMetrics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.tron.common.prometheus;

import com.google.protobuf.ByteString;
import java.util.List;
import org.tron.common.utils.StringUtil;

public class SRMetrics {

private SRMetrics() {
throw new IllegalStateException("SRMetrics");
}

public static void recordSrSetChange(List<ByteString> currentWits, List<ByteString> newWits) {
if (!Metrics.enabled()) {
return;
}
newWits.stream()
.filter(w -> !currentWits.contains(w))
.forEach(w -> Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
MetricLabels.Counter.SR_ADD, StringUtil.encode58Check(w.toByteArray())));
currentWits.stream()
.filter(w -> !newWits.contains(w))
.forEach(w -> Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
MetricLabels.Counter.SR_REMOVE, StringUtil.encode58Check(w.toByteArray())));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.bouncycastle.util.encoders.Hex;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.tron.common.prometheus.SRMetrics;
import org.tron.consensus.ConsensusDelegate;
import org.tron.consensus.pbft.PbftManager;
import org.tron.core.capsule.AccountCapsule;
Expand Down Expand Up @@ -141,6 +142,8 @@ public void doMaintenance() {
witnessCapsule.setIsJobs(true);
consensusDelegate.saveWitness(witnessCapsule);
});

SRMetrics.recordSrSetChange(currentWits, newWits);
}

logger.info("Update witness success. \nbefore: {} \nafter: {}",
Expand Down
5 changes: 5 additions & 0 deletions framework/src/main/java/org/tron/core/db/Manager.java
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,11 @@ public void pushBlock(final BlockCapsule block)
synchronized (this) {
Metrics.histogramObserve(blockedTimer.get());
blockedTimer.remove();
if (Metrics.enabled()) {
Metrics.histogramObserve(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT,
block.getTransactions().size(),
StringUtil.encode58Check(block.getWitnessAddress().toByteArray()));
}
long headerNumber = getDynamicPropertiesStore().getLatestBlockHeaderNumber();
if (block.getNum() <= headerNumber && khaosDb.containBlockInMiniStore(block.getBlockId())) {
logger.info("Block {} is already exist.", block.getBlockId().getString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,10 @@ public void applyBlock(BlockCapsule block) {
}

//TPS
if (block.getTransactions().size() > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, block.getTransactions().size());
Metrics.counterInc(MetricKeys.Counter.TXS, block.getTransactions().size(),
int txCount = block.getTransactions().size();
if (txCount > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, txCount);
Metrics.counterInc(MetricKeys.Counter.TXS, txCount,
MetricLabels.Counter.TXS_SUCCESS, MetricLabels.Counter.TXS_SUCCESS);
}
}
Expand Down
206 changes: 206 additions & 0 deletions framework/src/test/java/org/tron/common/prometheus/SRMetricsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
package org.tron.common.prometheus;

import com.google.protobuf.ByteString;
import io.prometheus.client.CollectorRegistry;
import java.util.Arrays;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicInteger;
import javax.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.tron.common.BaseTest;
import org.tron.common.TestConstants;
import org.tron.common.utils.StringUtil;
import org.tron.consensus.dpos.MaintenanceManager;
import org.tron.core.capsule.AccountCapsule;
import org.tron.core.capsule.VotesCapsule;
import org.tron.core.capsule.WitnessCapsule;
import org.tron.core.config.args.Args;
import org.tron.core.consensus.ConsensusService;
import org.tron.protos.Protocol;
import org.tron.protos.Protocol.Vote;

@Slf4j(topic = "metric")
public class SRMetricsTest extends BaseTest {

private static final AtomicInteger PORT = new AtomicInteger(0);
private static final AtomicInteger UNIQUE = new AtomicInteger(0);

@Resource
private MaintenanceManager maintenanceManager;
@Resource
private ConsensusService consensusService;

static {
Args.setParam(new String[]{"-d", dbPath()}, TestConstants.TEST_CONF);
Args.getInstance().setNodeListenPort(20000 + PORT.incrementAndGet());
Args.getInstance().setMetricsPrometheusEnable(true);
Metrics.init();
}

@Before
public void setUp() {
Args.getInstance().setMetricsPrometheusEnable(true);
consensusService.start();
}

@After
public void tearDown() {
Args.getInstance().setMetricsPrometheusEnable(true);
}

/**
* Drive the full maintenance flow: starting with a single active witness while WitnessStore
* contains additional ones, doMaintenance() should expand active witnesses to the full set and
* emit SR_ADD for each newly active witness.
*/
@Test
public void testSrAddViaMaintenance() {
ByteString stableWit = registerWitness();
ByteString newWit1 = registerWitness();
ByteString newWit2 = registerWitness();

chainBaseManager.getWitnessScheduleStore()
.saveActiveWitnesses(Collections.singletonList(stableWit));

seedVote(stableWit);

maintenanceManager.doMaintenance();

Assert.assertEquals(1, sample(MetricLabels.Counter.SR_ADD, newWit1).intValue());
Assert.assertEquals(1, sample(MetricLabels.Counter.SR_ADD, newWit2).intValue());
Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, stableWit));
Assert.assertNull(sample(MetricLabels.Counter.SR_REMOVE, stableWit));
}

/**
* Active witness set already matches WitnessStore → no metric emitted.
*/
@Test
public void testNoMetricWhenSetUnchanged() {
ByteString witA = registerWitness();
ByteString witB = registerWitness();

chainBaseManager.getWitnessScheduleStore()
.saveActiveWitnesses(Arrays.asList(witA, witB));

seedVote(witA);

maintenanceManager.doMaintenance();

Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, witA));
Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, witB));
Assert.assertNull(sample(MetricLabels.Counter.SR_REMOVE, witA));
Assert.assertNull(sample(MetricLabels.Counter.SR_REMOVE, witB));
}

/**
* Empty VotesStore → countVote() is empty → SR change check is skipped, even when the active
* set differs from the full witness store.
*/
@Test
public void testNoMetricWhenNoVotes() {
ByteString stableWit = registerWitness();
ByteString newWit = registerWitness();

chainBaseManager.getWitnessScheduleStore()
.saveActiveWitnesses(Collections.singletonList(stableWit));

maintenanceManager.doMaintenance();

Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, newWit));
}

/**
* Metrics disabled → record() short-circuits even though the active set changes.
*/
@Test
public void testNoMetricWhenMetricsDisabled() {
Args.getInstance().setMetricsPrometheusEnable(false);
try {
ByteString stableWit = registerWitness();
ByteString newWit = registerWitness();

chainBaseManager.getWitnessScheduleStore()
.saveActiveWitnesses(Collections.singletonList(stableWit));

seedVote(stableWit);

maintenanceManager.doMaintenance();

Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, newWit));
} finally {
Args.getInstance().setMetricsPrometheusEnable(true);
}
}

/**
* SR_REMOVE is verified by directly calling record() instead of going through doMaintenance(),
* because driving a removal through the real flow is impractical here:
*
* <p>Inside doMaintenance(), the block before SRMetrics.recordSrSetChange() iterates currentWits
* and calls setIsJobs(false) on each WitnessCapsule fetched from WitnessStore. If currentWits
* contains any address that is not present in WitnessStore, getWitness() returns null and the
* code NPEs — so SR_REMOVE cannot be triggered by simply pointing the active set at an
* "obsolete" address.
*
* <p>The only other path to SR_REMOVE is rank-based eviction: with more than
* MAX_ACTIVE_WITNESS_NUM (27) witnesses, sorting drops the lowest-ranked one. Building that
* setup just to exercise this branch is heavy and adds little value, since SR_ADD and
* SR_REMOVE share the exact same emit logic in record() — verifying SR_ADD via doMaintenance
* already proves the wiring is correct, and this direct call covers the symmetric branch.
*/
@Test
public void testSrRemoveDirect() {
ByteString stableWit = uniqueAddress();
ByteString removedWit = uniqueAddress();

SRMetrics.recordSrSetChange(
Arrays.asList(stableWit, removedWit),
Collections.singletonList(stableWit));

Assert.assertEquals(1, sample(MetricLabels.Counter.SR_REMOVE, removedWit).intValue());
Assert.assertNull(sample(MetricLabels.Counter.SR_ADD, removedWit));
Assert.assertNull(sample(MetricLabels.Counter.SR_REMOVE, stableWit));
}

private ByteString registerWitness() {
ByteString address = uniqueAddress();
chainBaseManager.getWitnessStore().put(address.toByteArray(), new WitnessCapsule(address));
chainBaseManager.addWitness(address);
chainBaseManager.getAccountStore().put(address.toByteArray(),
new AccountCapsule(Protocol.Account.newBuilder().setAddress(address).build()));
return address;
}

private void seedVote(ByteString voteFor) {
ByteString voter = uniqueAddress();
VotesCapsule votes = new VotesCapsule(voter, Collections.emptyList(),
Collections.singletonList(Vote.newBuilder()
.setVoteAddress(voteFor)
.setVoteCount(1L)
.build()));
chainBaseManager.getVotesStore().put(voter.toByteArray(), votes);
}

private ByteString uniqueAddress() {
int n = UNIQUE.incrementAndGet();
byte[] bytes = new byte[21];
bytes[0] = 0x41;
bytes[17] = (byte) ((n >> 16) & 0xFF);
bytes[18] = (byte) ((n >> 8) & 0xFF);
bytes[19] = (byte) (n & 0xFF);
bytes[20] = 0x01;
return ByteString.copyFrom(bytes);
}

private Double sample(String action, ByteString witness) {
return CollectorRegistry.defaultRegistry.getSampleValue(
MetricKeys.Counter.SR_SET_CHANGE + "_total",
new String[]{"action", "witness"},
new String[]{action, StringUtil.encode58Check(witness.toByteArray())});
}
}
Loading
Loading