diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 5c8b54949cd..cc149f03ae6 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -59,6 +59,7 @@ async-task,https://github.com/smol-rs/async-task,Apache-2.0 OR MIT,Stjepan Glavi async-trait,https://github.com/dtolnay/async-trait,MIT OR Apache-2.0,David Tolnay atoi,https://github.com/pacman82/atoi-rs,MIT,Markus Klein atomic-waker,https://github.com/smol-rs/atomic-waker,Apache-2.0 OR MIT,"Stjepan Glavina , Contributors to futures-rs" +atomic_float,https://github.com/thomcc/atomic_float,Apache-2.0 OR MIT OR Unlicense,Thom Chiovoloni aws-config,https://github.com/smithy-lang/smithy-rs,Apache-2.0,"AWS Rust SDK Team , Russell Cohen " aws-credential-types,https://github.com/smithy-lang/smithy-rs,Apache-2.0,AWS Rust SDK Team aws-lc-rs,https://github.com/aws/aws-lc-rs,ISC AND (Apache-2.0 OR ISC),AWS-LibCrypto @@ -179,6 +180,8 @@ const-oid,https://github.com/RustCrypto/formats/tree/master/const-oid,Apache-2.0 const-random,https://github.com/tkaitchuck/constrandom,MIT OR Apache-2.0,Tom Kaitchuck const-random-macro,https://github.com/tkaitchuck/constrandom,MIT OR Apache-2.0,Tom Kaitchuck const_fn,https://github.com/taiki-e/const_fn,Apache-2.0 OR MIT,The const_fn Authors +const_format,https://github.com/rodrimati1992/const_format_crates,Zlib,rodrimati1992 +const_format_proc_macros,https://github.com/rodrimati1992/const_format_crates,Zlib,rodrimati1992 constant_time_eq,https://github.com/cesarb/constant_time_eq,CC0-1.0 OR MIT-0 OR Apache-2.0,Cesar Eduardo Barros convert_case,https://github.com/rutrum/convert-case,MIT,rutrum core-foundation,https://github.com/servo/core-foundation-rs,MIT OR Apache-2.0,The Servo Project Developers @@ -190,6 +193,7 @@ crc-catalog,https://github.com/akhilles/crc-catalog,MIT OR Apache-2.0,Akhil Vela crc-fast,https://github.com/awesomized/crc-fast-rust,MIT OR Apache-2.0,Don MacAskill crc32fast,https://github.com/srijs/rust-crc32fast,MIT OR Apache-2.0,"Sam Rijs , Alex Crichton " 
criterion-plot,https://github.com/criterion-rs/criterion.rs,Apache-2.0 OR MIT,"Jorge Aparicio , Brook Heisler " +critical-section,https://github.com/rust-embedded/critical-section,MIT OR Apache-2.0,The critical-section Authors cron,https://github.com/zslayton/cron,MIT OR Apache-2.0,Zack Slayton crossbeam-channel,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crossbeam-channel Authors crossbeam-deque,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crossbeam-deque Authors @@ -295,6 +299,7 @@ etcetera,https://github.com/lunacookies/etcetera,MIT OR Apache-2.0,The etcetera event-listener,https://github.com/smol-rs/event-listener,Apache-2.0 OR MIT,Stjepan Glavina event-listener,https://github.com/smol-rs/event-listener,Apache-2.0 OR MIT,"Stjepan Glavina , John Nunley " event-listener-strategy,https://github.com/smol-rs/event-listener-strategy,Apache-2.0 OR MIT,John Nunley +evmap,https://github.com/jonhoo/evmap,MIT OR Apache-2.0,Jon Gjengset fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fancy-regex,https://github.com/fancy-regex/fancy-regex,MIT,"Raph Levien , Robin Stocker , Keith Hall " fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel @@ -330,6 +335,7 @@ futures-sink,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futur futures-task,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futures-task Authors futures-timer,https://github.com/async-rs/futures-timer,MIT OR Apache-2.0,Alex Crichton futures-util,https://github.com/rust-lang/futures-rs,MIT OR Apache-2.0,The futures-util Authors +generator,https://github.com/Xudong-Huang/generator-rs,MIT OR Apache-2.0,Xudong Huang generic-array,https://github.com/fizyk20/generic-array,MIT,"Bartłomiej Kamiński , Aaron Trent " getrandom,https://github.com/rust-random/getrandom,MIT OR Apache-2.0,The Rand Project Developers gimli,https://github.com/gimli-rs/gimli,MIT OR Apache-2.0,The gimli Authors @@ 
-346,6 +352,7 @@ grok,https://github.com/mmastrac/grok,Apache-2.0,"Matt Mastracci , Jack Grigg " h2,https://github.com/hyperium/h2,MIT,"Carl Lerche , Sean McArthur " half,https://github.com/VoidStarKat/half-rs,MIT OR Apache-2.0,Kathryn Long +hashbag,https://github.com/jonhoo/hashbag,MIT OR Apache-2.0,Jon Gjengset hashbrown,https://github.com/rust-lang/hashbrown,MIT OR Apache-2.0,Amanieu d'Antras hashlink,https://github.com/kyren/hashlink,MIT OR Apache-2.0,kyren hdrhistogram,https://github.com/HdrHistogram/HdrHistogram_rust,MIT OR Apache-2.0,"Jon Gjengset , Marshall Pierce " @@ -418,12 +425,15 @@ json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McC jsonschema,https://github.com/Stranger6667/jsonschema,MIT,Dmitry Dygalo jsonwebtoken,https://github.com/Keats/jsonwebtoken,MIT,Vincent Prouillet keccak,https://github.com/RustCrypto/sponges/tree/master/keccak,Apache-2.0 OR MIT,RustCrypto Developers +konst,https://github.com/rodrimati1992/konst,Zlib,rodrimati1992 +konst_macro_rules,https://github.com/rodrimati1992/konst,Zlib,rodrimati1992 krb5-src,https://github.com/MaterializeInc/rust-krb5-src,Apache-2.0,"Materialize, Inc." 
lalrpop-util,https://github.com/lalrpop/lalrpop,Apache-2.0 OR MIT,Niko Matsakis lambda_runtime,https://github.com/awslabs/aws-lambda-rust-runtime,Apache-2.0,"David Calavera , Harold Sun " lambda_runtime_api_client,https://github.com/awslabs/aws-lambda-rust-runtime,Apache-2.0,"David Calavera , Harold Sun " lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2.0,Marvin Löbel leb128fmt,https://github.com/bluk/leb128fmt,MIT OR Apache-2.0,Bryant Luk +left-right,https://github.com/jonhoo/left-right,MIT OR Apache-2.0,Jon Gjengset levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel lexical-core,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh lexical-parse-float,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh @@ -445,6 +455,7 @@ linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM- litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers lock_api,https://github.com/Amanieu/parking_lot,MIT OR Apache-2.0,Amanieu d'Antras log,https://github.com/rust-lang/log,MIT OR Apache-2.0,The Rust Project Developers +loom,https://github.com/tokio-rs/loom,MIT,Carl Lerche lru,https://github.com/jeromefroe/lru-rs,MIT,Jerome Froelich lru-slab,https://github.com/Ralith/lru-slab,MIT OR Apache-2.0 OR Zlib,Benjamin Saunders lz4,https://github.com/10xGenomics/lz4-rs,MIT,"Jens Heyens , Artem V. 
Navrotskiy , Patrick Marks " @@ -459,6 +470,8 @@ memchr,https://github.com/BurntSushi/memchr,Unlicense OR MIT,"Andrew Gallant , Yevhenii Reizner , The Contributors" metrics,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence metrics-exporter-dogstatsd,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence +metrics-exporter-otel,https://github.com/palindrom615/metrics,MIT,Whoemoon Jang +metrics-exporter-prometheus,https://github.com/metrics-rs/metrics,MIT AND Apache-2.0,Toby Lawrence metrics-util,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence mime,https://github.com/hyperium/mime,MIT OR Apache-2.0,Sean McArthur mime_guess,https://github.com/abonander/mime_guess,MIT,Austin Bonander @@ -648,6 +661,7 @@ rand_core,https://github.com/rust-random/rand_core,MIT OR Apache-2.0,The Rand Pr rand_hc,https://github.com/rust-random/rand,MIT OR Apache-2.0,The Rand Project Developers rand_xorshift,https://github.com/rust-random/rngs,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers" rand_xoshiro,https://github.com/rust-random/rngs,MIT OR Apache-2.0,The Rand Project Developers +rapidhash,https://github.com/hoxxep/rapidhash,MIT OR Apache-2.0,Liam Gray raw-cpuid,https://github.com/gz/rust-cpuid,MIT,Gerd Zellweger rayon,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon Authors rayon-core,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon-core Authors @@ -712,6 +726,7 @@ seahash,https://gitlab.redox-os.org/redox-os/seahash,MIT,"ticki , Kornel " security-framework-sys,https://github.com/kornelski/rust-security-framework,MIT OR Apache-2.0,"Steven Fackler , Kornel " +seize,https://github.com/ibraheemdev/seize,MIT,Ibraheem Ahmed semver,https://github.com/dtolnay/semver,MIT OR Apache-2.0,David Tolnay separator,https://github.com/saghm/rust-separator,MIT,Saghm Rossi seq-macro,https://github.com/dtolnay/seq-macro,MIT OR Apache-2.0,David Tolnay diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 0a6807c3884..8b12b0809c3 
100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -704,6 +704,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "atomic_float" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628d228f918ac3b82fe590352cc719d30664a0c13ca3a60266fe02c7132d480a" + [[package]] name = "autocfg" version = "1.5.0" @@ -2324,6 +2330,27 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413d67b29ef1021b4d60f4aa1e925ca031751e213832b4b1d588fae623c05c60" +[[package]] +name = "const_format" +version = "0.2.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4481a617ad9a412be3b97c5d403fef8ed023103368908b9c50af598ff467cc1e" +dependencies = [ + "const_format_proc_macros", + "konst", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -2454,6 +2481,12 @@ dependencies = [ "itertools 0.13.0", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + [[package]] name = "cron" version = "0.16.0" @@ -4034,6 +4067,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "evmap" +version = "11.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8874945f036109c72242964c1174cf99434e30cfa45bf45fedc983f50046f8" +dependencies = [ + "hashbag", + "left-right", + "smallvec", +] + [[package]] name = "fail" version = "0.5.1" @@ -4414,6 +4458,21 @@ dependencies = [ 
"slab", ] +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -4694,6 +4753,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hashbag" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7040a10f52cba493ddb09926e15d10a9d8a28043708a405931fe4c6f19fac064" + [[package]] name = "hashbrown" version = "0.12.3" @@ -5572,6 +5637,21 @@ dependencies = [ "cpufeatures 0.2.17", ] +[[package]] +name = "konst" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128133ed7824fcd73d6e7b17957c5eb7bacb885649bd8c69708b2331a10bcefb" +dependencies = [ + "konst_macro_rules", +] + +[[package]] +name = "konst_macro_rules" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37" + [[package]] name = "krb5-src" version = "0.3.4" @@ -5676,6 +5756,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "left-right" +version = "0.11.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f0c21e4c8ff95f487fb34e6f9182875f42c84cef966d29216bf115d9bba835a" +dependencies = [ + "crossbeam-utils", + "loom", + "slab", +] + [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -5860,6 +5951,19 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lru" version = "0.16.4" @@ -5985,12 +6089,12 @@ dependencies = [ [[package]] name = "metrics" -version = "0.24.3" +version = "0.24.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +checksum = "b7cd3e9eb685089c784f5769b1197d348c7274bc20d4e1349650f63b91b6d0af" dependencies = [ - "ahash", "portable-atomic", + "rapidhash", ] [[package]] @@ -6008,11 +6112,45 @@ dependencies = [ "tracing", ] +[[package]] +name = "metrics-exporter-otel" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58b8984fa38406b80c094943c0ba90e53d5fff0aea051ff9fac96cf6940993c8" +dependencies = [ + "metrics", + "metrics-util", + "opentelemetry", + "portable-atomic", + "scc", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c0ca2990f7f78a72c4000ddce186db7d1b700477426563ee851c95ea3c0d0c4" +dependencies = [ + "base64 0.22.1", + "evmap", + "http-body-util", + "hyper 1.9.0", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics", + "metrics-util", + "quanta", + "thiserror 2.0.18", + "tokio", + "tracing", +] + [[package]] name = "metrics-util" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdfb1365fea27e6dd9dc1dbc19f570198bc86914533ad639dae939635f096be4" +checksum = "55ff5c12b797ebf094dc7c1d87e905efc0329cba332f96d51db03875441012b5" dependencies = [ "aho-corasick", "crossbeam-epoch", @@ -6025,6 +6163,7 @@ dependencies = [ "radix_trie", "rand 0.9.4", "rand_xoshiro", + "rapidhash", "sketches-ddsketch 0.3.1", ] @@ -6965,6 +7104,16 @@ 
dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "papaya" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "997ee03cd38c01469a7046643714f0ad28880bcb9e6679ff0666e24817ca19b7" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "papergrid" version = "0.17.0" @@ -7609,6 +7758,9 @@ name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +dependencies = [ + "critical-section", +] [[package]] name = "portable-atomic-util" @@ -8168,6 +8320,7 @@ dependencies = [ "flume 0.12.0", "futures", "quickwit-common", + "quickwit-metrics", "rand 0.10.1", "serde", "serde_json", @@ -8210,6 +8363,9 @@ dependencies = [ "itertools 0.14.0", "metrics", "metrics-exporter-dogstatsd", + "metrics-exporter-otel", + "metrics-exporter-prometheus", + "metrics-util", "numfmt", "openssl-probe 0.1.6", "opentelemetry", @@ -8226,6 +8382,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "quickwit-rest-client", "quickwit-search", @@ -8262,6 +8419,7 @@ dependencies = [ "pin-project", "quickwit-common", "quickwit-config", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "serde", @@ -8336,10 +8494,14 @@ dependencies = [ "hyper 1.9.0", "hyper-util", "itertools 0.14.0", + "metrics", + "metrics-exporter-prometheus", + "metrics-util", "pin-project", "pnet", "prometheus", "proptest", + "quickwit-metrics", "rand 0.10.1", "rayon", "regex", @@ -8411,6 +8573,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "serde", @@ -8559,6 +8722,7 @@ dependencies = [ "quickwit-config", "quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-storage", @@ -8610,6 +8774,7 @@ dependencies = [ 
"quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-parquet-engine", "quickwit-proto", @@ -8655,6 +8820,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-doc-mapper", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "rand_distr", @@ -8727,6 +8893,7 @@ dependencies = [ "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-proto", "quickwit-query", @@ -8759,6 +8926,7 @@ dependencies = [ "quickwit-index-management", "quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-query", @@ -8791,6 +8959,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-lambda-server", + "quickwit-metrics", "quickwit-proto", "quickwit-search", "quickwit-storage", @@ -8853,6 +9022,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-doc-mapper", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "quickwit-query", @@ -8880,6 +9050,30 @@ dependencies = [ "uuid", ] +[[package]] +name = "quickwit-metrics" +version = "0.8.0" +dependencies = [ + "atomic_float", + "const_format", + "criterion", + "dashmap 6.1.0", + "inventory", + "metrics", + "metrics-exporter-prometheus", + "metrics-util", + "papaya", + "proptest", + "rustc-hash", +] + +[[package]] +name = "quickwit-metrics-inventory" +version = "0.8.0" +dependencies = [ + "quickwit-metrics", +] + [[package]] name = "quickwit-opentelemetry" version = "0.8.0" @@ -8892,6 +9086,7 @@ dependencies = [ "quickwit-config", "quickwit-ingest", "quickwit-metastore", + "quickwit-metrics", "quickwit-parquet-engine", "quickwit-proto", "serde", @@ -8914,8 +9109,8 @@ dependencies = [ "parquet", "proptest", "prost 0.14.3", - "quickwit-common", "quickwit-dst", + "quickwit-metrics", "quickwit-proto", "rand 0.10.1", "regex", @@ -9044,6 +9239,7 @@ dependencies = [ "quickwit-doc-mapper", 
"quickwit-indexing", "quickwit-metastore", + "quickwit-metrics", "quickwit-proto", "quickwit-query", "quickwit-storage", @@ -9105,6 +9301,7 @@ dependencies = [ "quickwit-janitor", "quickwit-lambda-client", "quickwit-metastore", + "quickwit-metrics", "quickwit-opentelemetry", "quickwit-proto", "quickwit-query", @@ -9173,6 +9370,7 @@ dependencies = [ "quickwit-aws", "quickwit-common", "quickwit-config", + "quickwit-metrics", "quickwit-proto", "regex", "reqwest", @@ -9445,6 +9643,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + [[package]] name = "raw-cpuid" version = "11.6.0" @@ -10319,6 +10526,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "semver" version = "1.0.28" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 04244aeee40..c1d16ebff24 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -26,6 +26,8 @@ members = [ "quickwit-lambda-server", "quickwit-macros", "quickwit-metastore", + "quickwit-metrics", + "quickwit-metrics-inventory", # Disabling metastore-utils from the quickwit projects to ease build/deps. # We can reenable it when we need it. @@ -69,6 +71,8 @@ default-members = [ "quickwit-lambda-server", "quickwit-macros", "quickwit-metastore", + "quickwit-metrics", + "quickwit-metrics-inventory", "quickwit-opentelemetry", "quickwit-parquet-engine", "quickwit-proto", @@ -90,6 +94,7 @@ authors = ["Quickwit, Inc. 
"] license = "Apache-2.0" [workspace.dependencies] +ahash = "0.8" anyhow = "1" arc-swap = "1.8" arrow = { version = "58", default-features = false, features = ["ipc"] } @@ -97,6 +102,7 @@ assert-json-diff = "2" async-compression = { version = "0.4", features = ["tokio", "gzip"] } async-speed-limit = "0.4" async-trait = "0.1" +atomic_float = "1.1" backtrace = "0.3" base64 = "0.22" binggan = { version = "0.15" } @@ -113,8 +119,10 @@ clap = { version = "4.5", features = ["env", "string"] } coarsetime = "0.1" colored = "3.0" console-subscriber = "0.5" +const_format = "0.2" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.16" +dashmap = "6.1" dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" @@ -154,6 +162,7 @@ hyper-util = { version = "0.1", default-features = false, features = [ ] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" +inventory = "0.3" itertools = "0.14" lambda_runtime = "0.13" json_comments = "0.2" @@ -163,6 +172,9 @@ matches = "0.1" md5 = "0.8" metrics = "0.24" metrics-exporter-dogstatsd = "0.9" +metrics-exporter-otel = "0.3" +metrics-exporter-prometheus = { version = "0.18", default-features = false } +metrics-util = "0.20" mime_guess = "2.0" mini-moka = "0.10.3" mockall = "0.14" @@ -376,6 +388,8 @@ quickwit-lambda-client = { path = "quickwit-lambda-client" } quickwit-lambda-server = { path = "quickwit-lambda-server" } quickwit-macros = { path = "quickwit-macros" } quickwit-metastore = { path = "quickwit-metastore" } +quickwit-metrics = { path = "quickwit-metrics" } +quickwit-metrics-inventory = { path = "quickwit-metrics-inventory" } quickwit-opentelemetry = { path = "quickwit-opentelemetry" } quickwit-parquet-engine = { path = "quickwit-parquet-engine" } quickwit-proto = { path = "quickwit-proto" } diff --git a/quickwit/quickwit-actors/Cargo.toml b/quickwit/quickwit-actors/Cargo.toml index 7832c48e967..08c63e62add 100644 --- a/quickwit/quickwit-actors/Cargo.toml 
+++ b/quickwit/quickwit-actors/Cargo.toml @@ -23,6 +23,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } [features] testsuite = [] diff --git a/quickwit/quickwit-actors/src/actor_context.rs b/quickwit/quickwit-actors/src/actor_context.rs index 3186e210647..9a55a6f91eb 100644 --- a/quickwit/quickwit-actors/src/actor_context.rs +++ b/quickwit/quickwit-actors/src/actor_context.rs @@ -20,8 +20,8 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use quickwit_common::metrics::IntCounter; use quickwit_common::{KillSwitch, Progress, ProtectedZoneGuard}; +use quickwit_metrics::Counter; use tokio::sync::{oneshot, watch}; use tracing::{debug, error}; @@ -61,7 +61,7 @@ pub struct ActorContextInner { self_mailbox: Mailbox, progress: Progress, actor_state: AtomicState, - backpressure_micros_counter_opt: Option, + backpressure_micros_counter_opt: Option, observable_state_tx: watch::Sender, // Boolean marking the presence of an observe message in the actor's high priority queue. 
observe_enqueued: AtomicBool, @@ -72,7 +72,7 @@ impl ActorContext { self_mailbox: Mailbox, spawn_ctx: SpawnContext, observable_state_tx: watch::Sender, - backpressure_micros_counter_opt: Option, + backpressure_micros_counter_opt: Option, ) -> Self { ActorContext { inner: ActorContextInner { diff --git a/quickwit/quickwit-actors/src/mailbox.rs b/quickwit/quickwit-actors/src/mailbox.rs index f222294e4c2..cbbf6048f5d 100644 --- a/quickwit/quickwit-actors/src/mailbox.rs +++ b/quickwit/quickwit-actors/src/mailbox.rs @@ -19,7 +19,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, LazyLock, Weak}; use std::time::Instant; -use quickwit_common::metrics::{GaugeGuard, IntCounter, IntGauge}; +use quickwit_metrics::{Counter, Gauge, GaugeGuard, gauge}; use tokio::sync::oneshot; use crate::channel_with_priority::{Receiver, Sender, TrySendError}; @@ -191,7 +191,7 @@ impl Mailbox { pub async fn send_message_with_backpressure_counter( &self, message: M, - backpressure_micros_counter_opt: Option<&IntCounter>, + backpressure_micros_counter_opt: Option<&Counter>, ) -> Result, SendError> where A: DeferableReplyHandler, @@ -205,7 +205,7 @@ impl Mailbox { let now = Instant::now(); self.inner.tx.send_low_priority(envelope).await?; let elapsed = now.elapsed(); - backpressure_micros_counter.inc_by(elapsed.as_micros() as u64); + backpressure_micros_counter.increment(elapsed.as_micros() as u64); } else { self.inner.tx.send_low_priority(envelope).await?; } @@ -273,7 +273,7 @@ impl Mailbox { pub async fn ask_with_backpressure_counter( &self, message: M, - backpressure_micros_counter_opt: Option<&IntCounter>, + backpressure_micros_counter_opt: Option<&Counter>, ) -> Result> where A: DeferableReplyHandler, @@ -308,9 +308,17 @@ impl Mailbox { struct InboxInner { rx: Receiver>, - _inboxes_count_gauge_guard: GaugeGuard<'static>, + _inboxes_count_gauge_guard: GaugeGuard, } +static INBOX_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "inboxes_count", + description: 
"overall count of actors", + subsystem: "actor", + ) +}); + pub struct Inbox { inner: Arc>, } @@ -385,17 +393,9 @@ impl Inbox { } } -fn get_actor_inboxes_count_gauge_guard() -> GaugeGuard<'static> { - static INBOX_GAUGE: LazyLock = LazyLock::new(|| { - quickwit_common::metrics::new_gauge( - "inboxes_count", - "overall count of actors", - "actor", - &[], - ) - }); - let mut gauge_guard = GaugeGuard::from_gauge(&INBOX_GAUGE); - gauge_guard.add(1); +fn get_actor_inboxes_count_gauge_guard() -> GaugeGuard { + let gauge_guard = GaugeGuard::from_gauge(&INBOX_GAUGE); + gauge_guard.increment(1.0); gauge_guard } @@ -452,6 +452,8 @@ mod tests { use std::mem; use std::time::Duration; + use quickwit_metrics::counter; + use super::*; use crate::tests::{Ping, PingReceiverActor}; use crate::{ActorContext, ActorExitStatus, Handler, Universe}; @@ -519,8 +521,12 @@ mod tests { .await .unwrap(); // At this point the actor was started and even processed a message entirely. - let backpressure_micros_counter = - IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_low_backpressure", + description: "help for test_counter", + subsystem: "actor", + observable: true, + ); let wait_duration = Duration::from_millis(1); let processed = mailbox .send_message_with_backpressure_counter( @@ -546,8 +552,12 @@ mod tests { .ask_with_backpressure_counter(Duration::default(), None) .await .unwrap(); - let backpressure_micros_counter = - IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_backpressure", + description: "help for test_counter", + subsystem: "actor", + observable: true, + ); let wait_duration = Duration::from_millis(1); mailbox .send_message_with_backpressure_counter( @@ -578,8 +588,12 @@ mod tests { .ask_with_backpressure_counter(Duration::default(), None) .await .unwrap(); - let backpressure_micros_counter = - 
IntCounter::new("test_counter", "help for test_counter").unwrap(); + let backpressure_micros_counter = counter!( + name: "test_counter_no_waiting_backpressure", + description: "help for test_counter", + subsystem: "actor", + observable: true, + ); let start = Instant::now(); mailbox .ask_with_backpressure_counter(Duration::from_millis(1), None) diff --git a/quickwit/quickwit-actors/src/spawn_builder.rs b/quickwit/quickwit-actors/src/spawn_builder.rs index 6dfc1aa9155..922cfc4d71d 100644 --- a/quickwit/quickwit-actors/src/spawn_builder.rs +++ b/quickwit/quickwit-actors/src/spawn_builder.rs @@ -16,7 +16,7 @@ use std::fmt; use std::time::Duration; use anyhow::Context; -use quickwit_common::metrics::IntCounter; +use quickwit_metrics::Counter; use sync_wrapper::SyncWrapper; use tokio::sync::watch; use tracing::{debug, error, info}; @@ -91,7 +91,7 @@ pub struct SpawnBuilder { spawn_ctx: SpawnContext, #[allow(clippy::type_complexity)] mailboxes: Option<(Mailbox, Inbox)>, - backpressure_micros_counter_opt: Option, + backpressure_micros_counter_opt: Option, } impl SpawnBuilder { @@ -129,10 +129,7 @@ impl SpawnBuilder { /// /// When using `.ask` the amount of time counted may be misleading. 
/// (See `Mailbox::ask_with_backpressure_counter` for more details) - pub fn set_backpressure_micros_counter( - mut self, - backpressure_micros_counter: IntCounter, - ) -> Self { + pub fn set_backpressure_micros_counter(mut self, backpressure_micros_counter: Counter) -> Self { self.backpressure_micros_counter_opt = Some(backpressure_micros_counter); self } diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index ebf3ae90e8c..b9d04fefc10 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -57,10 +57,14 @@ tracing-subscriber = { workspace = true } metrics = { workspace = true } metrics-exporter-dogstatsd = { workspace = true } +metrics-exporter-otel = { workspace = true } +metrics-exporter-prometheus = { workspace = true } +metrics-util = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-dst = { workspace = true } quickwit-index-management = { workspace = true } diff --git a/quickwit/quickwit-cli/src/jemalloc.rs b/quickwit/quickwit-cli/src/jemalloc.rs index f22caff5a37..66ca7c8f4df 100644 --- a/quickwit/quickwit-cli/src/jemalloc.rs +++ b/quickwit/quickwit-cli/src/jemalloc.rs @@ -14,7 +14,6 @@ use std::time::Duration; -use quickwit_common::metrics::MEMORY_METRICS; use tikv_jemallocator::Jemalloc; use tracing::error; @@ -30,8 +29,6 @@ pub static GLOBAL: Jemalloc = Jemalloc; const JEMALLOC_METRICS_POLLING_INTERVAL: Duration = Duration::from_secs(1); pub async fn jemalloc_metrics_loop() -> tikv_jemalloc_ctl::Result<()> { - let memory_metrics = MEMORY_METRICS.clone(); - // Obtain a MIB for the `epoch`, `stats.active`, `stats.allocated`, and `stats.resident` keys: let epoch_mib = tikv_jemalloc_ctl::epoch::mib()?; let active_mib = tikv_jemalloc_ctl::stats::active::mib()?; @@ -48,13 +45,13 @@ pub async fn jemalloc_metrics_loop() -> 
tikv_jemalloc_ctl::Result<()> { // Read statistics using MIB keys: let active = active_mib.read()?; - memory_metrics.active_bytes.set(active as i64); + quickwit_common::metrics::MEMORY_ACTIVE_BYTES.set(active as f64); let allocated = allocated_mib.read()?; - memory_metrics.allocated_bytes.set(allocated as i64); + quickwit_common::metrics::MEMORY_ALLOCATED_BYTES.set(allocated as f64); let resident = resident_mib.read()?; - memory_metrics.resident_bytes.set(resident as i64); + quickwit_common::metrics::MEMORY_RESIDENT_BYTES.set(resident as f64); } } diff --git a/quickwit/quickwit-cli/src/lib.rs b/quickwit/quickwit-cli/src/lib.rs index 45275c3ff5d..afdde568ac3 100644 --- a/quickwit/quickwit-cli/src/lib.rs +++ b/quickwit/quickwit-cli/src/lib.rs @@ -354,8 +354,6 @@ pub mod busy_detector { use tracing::debug; - use crate::metrics::CLI_METRICS; - // we need that time reference to use an atomic and not a mutex for LAST_UNPARK static TIME_REF: LazyLock = LazyLock::new(Instant::now); static ENABLED: AtomicBool = AtomicBool::new(false); @@ -393,10 +391,7 @@ pub mod busy_detector { .unwrap_or_default(); let now = now.as_micros() as u64; let delta = now - time.load(Ordering::Relaxed); - CLI_METRICS - .thread_unpark_duration_microseconds - .with_label_values([]) - .observe(delta as f64); + crate::metrics::THREAD_UNPARK_DURATION_MICROSECONDS.record(delta as f64); if delta > ALLOWED_DELAY_MICROS { emit_debug(delta, now); } diff --git a/quickwit/quickwit-cli/src/logger.rs b/quickwit/quickwit-cli/src/logger.rs index d1c994e893b..558e168f575 100644 --- a/quickwit/quickwit-cli/src/logger.rs +++ b/quickwit/quickwit-cli/src/logger.rs @@ -17,13 +17,20 @@ use std::sync::Arc; use std::{env, fmt}; use anyhow::Context; +use metrics_exporter_dogstatsd::DogStatsDRecorder; +use metrics_exporter_otel::OpenTelemetryRecorder; +use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusRecorder}; +use metrics_util::MetricKindMask; +use metrics_util::layers::{FanoutBuilder, 
RouterBuilder}; +use opentelemetry::metrics::MeterProvider; use opentelemetry::trace::TracerProvider; use opentelemetry::{KeyValue, global}; use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; use opentelemetry_otlp::{ - LogExporter, Protocol as OtlpWireProtocol, SpanExporter, WithExportConfig, + LogExporter, MetricExporter, Protocol as OtlpWireProtocol, SpanExporter, WithExportConfig, }; use opentelemetry_sdk::logs::SdkLoggerProvider; +use opentelemetry_sdk::metrics::{SdkMeterProvider as SdkMetricsProvider, Temporality}; use opentelemetry_sdk::propagation::TraceContextPropagator; use opentelemetry_sdk::trace::{BatchConfigBuilder, SdkTracerProvider}; use opentelemetry_sdk::{Resource, trace}; @@ -44,6 +51,13 @@ use tracing_subscriber::registry::LookupSpan; use crate::QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER_ENV_KEY; +const OTEL_EXPORTER_OTLP_PROTOCOL_ENV_KEY: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; +const OTEL_EXPORTER_OTLP_TRACES_PROTOCOL_ENV_KEY: &str = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"; +const OTEL_EXPORTER_OTLP_LOGS_PROTOCOL_ENV_KEY: &str = "OTEL_EXPORTER_OTLP_LOGS_PROTOCOL"; +const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL_ENV_KEY: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; +const OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE_ENV_KEY: &str = + "OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE"; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum OtlpProtocol { Grpc, @@ -81,6 +95,26 @@ impl OtlpProtocol { } .context("failed to initialize OTLP traces exporter") } + + fn metric_exporter(&self, temporality: Temporality) -> anyhow::Result { + match self { + OtlpProtocol::Grpc => MetricExporter::builder() + .with_tonic() + .with_temporality(temporality) + .build(), + OtlpProtocol::HttpProtobuf => MetricExporter::builder() + .with_http() + .with_temporality(temporality) + .with_protocol(OtlpWireProtocol::HttpBinary) + .build(), + OtlpProtocol::HttpJson => MetricExporter::builder() + .with_http() + .with_temporality(temporality) + 
.with_protocol(OtlpWireProtocol::HttpJson) + .build(), + } + .context("failed to initialize OTLP metrics exporter") + } } impl FromStr for OtlpProtocol { @@ -104,11 +138,143 @@ impl FromStr for OtlpProtocol { } } +struct OtlpExporterConfig { + enabled: bool, + default_protocol: String, +} + +fn load_otlp_exporter_config() -> OtlpExporterConfig { + OtlpExporterConfig { + enabled: get_bool_from_env(QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER_ENV_KEY, false), + default_protocol: get_from_env( + OTEL_EXPORTER_OTLP_PROTOCOL_ENV_KEY, + "grpc".to_string(), + false, + ), + } +} + +impl OtlpExporterConfig { + fn is_enabled(&self) -> bool { + self.enabled + } + + fn traces_protocol(&self) -> anyhow::Result { + self.resolve_protocol(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL_ENV_KEY) + } + + fn logs_protocol(&self) -> anyhow::Result { + self.resolve_protocol(OTEL_EXPORTER_OTLP_LOGS_PROTOCOL_ENV_KEY) + } + + fn metrics_protocol(&self) -> anyhow::Result { + self.resolve_protocol(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL_ENV_KEY) + } + + fn metrics_temporality(&self) -> anyhow::Result { + let temporality = get_from_env_opt::( + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE_ENV_KEY, + false, + ); + temporality + .as_deref() + .map(|temporality_str| { + OtlpMetricsTemporality::from_str(temporality_str).with_context(|| { + format!( + "failed to parse environment variable \ + `{OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE_ENV_KEY}`" + ) + }) + }) + .transpose() + .map(|temporality| { + temporality + .map(Temporality::from) + .unwrap_or(Temporality::Cumulative) + }) + } + + fn resolve_protocol(&self, exporter_protocol_env_key: &str) -> anyhow::Result { + let exporter_protocol = get_from_env_opt::(exporter_protocol_env_key, false); + let (protocol, env_key) = if let Some(protocol) = exporter_protocol { + (protocol, exporter_protocol_env_key) + } else { + ( + self.default_protocol.clone(), + OTEL_EXPORTER_OTLP_PROTOCOL_ENV_KEY, + ) + }; + + OtlpProtocol::from_str(&protocol) + .with_context(|| 
format!("failed to parse environment variable `{env_key}`")) + } +} + +struct OtlpMetricsTemporality(Temporality); + +impl FromStr for OtlpMetricsTemporality { + type Err = anyhow::Error; + + fn from_str(temporality_str: &str) -> anyhow::Result { + const TEMPORALITY_DELTA: &str = "delta"; + const TEMPORALITY_LOWMEMORY: &str = "lowmemory"; + const TEMPORALITY_CUMULATIVE: &str = "cumulative"; + + match temporality_str { + TEMPORALITY_DELTA => Ok(Self(Temporality::Delta)), + TEMPORALITY_LOWMEMORY => Ok(Self(Temporality::LowMemory)), + TEMPORALITY_CUMULATIVE => Ok(Self(Temporality::Cumulative)), + other => anyhow::bail!( + "unsupported OTLP metrics temporality `{other}`, supported values are \ + `{TEMPORALITY_DELTA}`, `{TEMPORALITY_LOWMEMORY}` and `{TEMPORALITY_CUMULATIVE}`" + ), + } + } +} + +impl From for Temporality { + fn from(temporality: OtlpMetricsTemporality) -> Self { + temporality.0 + } +} + +pub struct TelemetryHandle { + env_filter_reload_fn: EnvFilterReloadFn, + tracer_provider: Option, + logger_provider: Option, + meter_provider: Option, +} + +impl TelemetryHandle { + pub fn env_filter_reload_fn(&self) -> EnvFilterReloadFn { + self.env_filter_reload_fn.clone() + } + + pub fn shutdown(self) -> anyhow::Result<()> { + if let Some(tracer_provider) = self.tracer_provider { + tracer_provider + .shutdown() + .context("failed to shutdown OpenTelemetry tracer provider")?; + } + if let Some(logger_provider) = self.logger_provider { + logger_provider + .shutdown() + .context("failed to shutdown OpenTelemetry logger provider")?; + } + if let Some(meter_provider) = self.meter_provider { + meter_provider + .shutdown() + .context("failed to shutdown OpenTelemetry meter provider")?; + } + Ok(()) + } +} + #[cfg(feature = "tokio-console")] use crate::QW_ENABLE_TOKIO_CONSOLE_ENV_KEY; /// Load the default logging filter from the environment. The filter can later -/// be updated using the result callback of [setup_logging_and_tracing]. 
+/// be updated using the result callback of [init_telemetry]. fn startup_env_filter(level: Level) -> anyhow::Result { let env_filter = env::var("RUST_LOG") .map(|_| EnvFilter::from_default_env()) @@ -119,19 +285,25 @@ fn startup_env_filter(level: Level) -> anyhow::Result { type ReloadLayer = tracing_subscriber::reload::Layer; -pub fn setup_logging_and_tracing( +pub fn init_telemetry( level: Level, ansi_colors: bool, build_info: &BuildInfo, -) -> anyhow::Result<( - EnvFilterReloadFn, - Option<(SdkTracerProvider, SdkLoggerProvider)>, -)> { +) -> anyhow::Result { + let otlp_config = load_otlp_exporter_config(); + + let meter_provider = init_metrics_provider(build_info, &otlp_config)?; + #[cfg(feature = "tokio-console")] { if get_bool_from_env(QW_ENABLE_TOKIO_CONSOLE_ENV_KEY, false) { console_subscriber::init(); - return Ok((quickwit_serve::do_nothing_env_filter_reload_fn(), None)); + return Ok(TelemetryHandle { + env_filter_reload_fn: quickwit_serve::do_nothing_env_filter_reload_fn(), + tracer_provider: None, + logger_provider: None, + meter_provider, + }); } } global::set_text_map_propagator(TraceContextPropagator::new()); @@ -160,56 +332,24 @@ pub fn setup_logging_and_tracing( reloadable_env_filter, )?; + let env_filter_reload_fn: EnvFilterReloadFn = Arc::new(move |env_filter_def: &str| { + let new_env_filter = EnvFilter::try_new(env_filter_def)?; + reload_handle.reload(new_env_filter)?; + Ok(()) + }); + // Note on disabling ANSI characters: setting the ansi boolean on event format is insufficient. 
// It is thus set on layers, see https://github.com/tokio-rs/tracing/issues/1817 - let provider_opt = if get_bool_from_env(QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER_ENV_KEY, false) { - let global_protocol_str = - get_from_env("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc".to_string(), false); - let global_protocol = OtlpProtocol::from_str(&global_protocol_str)?; - - let traces_protocol_opt = - get_from_env_opt::("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", false); - let traces_protocol = traces_protocol_opt - .as_deref() - .map(OtlpProtocol::from_str) - .transpose()? - .unwrap_or(global_protocol); - - let span_exporter = traces_protocol.span_exporter()?; - let span_processor = trace::BatchSpanProcessor::builder(span_exporter) - .with_batch_config( - BatchConfigBuilder::default() - // Quickwit can generate a lot of spans, especially in debug mode, and the - // default queue size of 2048 is too small. - .with_max_queue_size(32_768) - .build(), - ) - .build(); - + let telemetry_handle = if otlp_config.is_enabled() { let resource = Resource::builder() .with_service_name("quickwit") .with_attribute(KeyValue::new("service.version", build_info.version.clone())) .build(); - let logs_protocol_opt = - get_from_env_opt::("OTEL_EXPORTER_OTLP_LOGS_PROTOCOL", false); - let logs_protocol = logs_protocol_opt - .as_deref() - .map(OtlpProtocol::from_str) - .transpose()? 
- .unwrap_or(global_protocol); - let log_exporter = logs_protocol.log_exporter()?; - let logger_provider = SdkLoggerProvider::builder() - .with_resource(resource.clone()) - .with_batch_exporter(log_exporter) - .build(); + let tracer_provider = init_tracer_provider(&otlp_config, resource.clone())?; + let logger_provider = init_logger_provider(&otlp_config, resource)?; - let tracing_provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() - .with_span_processor(span_processor) - .with_resource(resource) - .build(); - - let tracer = tracing_provider.tracer("quickwit"); + let tracer = tracer_provider.tracer("quickwit"); let telemetry_layer = tracing_opentelemetry::layer().with_tracer(tracer); // Bridge between tracing logs and otel tracing events @@ -220,27 +360,138 @@ pub fn setup_logging_and_tracing( .with(logs_otel_layer) .try_init() .context("failed to register tracing subscriber")?; - Some((tracing_provider, logger_provider)) + + TelemetryHandle { + env_filter_reload_fn, + tracer_provider: Some(tracer_provider), + logger_provider: Some(logger_provider), + meter_provider, + } } else { registry .try_init() .context("failed to register tracing subscriber")?; - None + TelemetryHandle { + env_filter_reload_fn, + tracer_provider: None, + logger_provider: None, + meter_provider, + } }; - Ok(( - Arc::new(move |env_filter_def: &str| { - let new_env_filter = EnvFilter::try_new(env_filter_def)?; - reload_handle.reload(new_env_filter)?; - Ok(()) - }), - provider_opt, - )) + Ok(telemetry_handle) +} + +fn init_tracer_provider( + otlp_config: &OtlpExporterConfig, + resource: Resource, +) -> anyhow::Result { + let traces_protocol = otlp_config.traces_protocol()?; + let span_exporter = traces_protocol.span_exporter()?; + let span_processor = trace::BatchSpanProcessor::builder(span_exporter) + .with_batch_config( + BatchConfigBuilder::default() + // Quickwit can generate a lot of spans, especially in debug mode, and the + // default queue size of 2048 is too small. 
+ .with_max_queue_size(32_768) + .build(), + ) + .build(); + + Ok(opentelemetry_sdk::trace::SdkTracerProvider::builder() + .with_span_processor(span_processor) + .with_resource(resource) + .build()) +} + +fn init_logger_provider( + otlp_config: &OtlpExporterConfig, + resource: Resource, +) -> anyhow::Result { + let logs_protocol = otlp_config.logs_protocol()?; + let log_exporter = logs_protocol.log_exporter()?; + Ok(SdkLoggerProvider::builder() + .with_resource(resource) + .with_batch_exporter(log_exporter) + .build()) +} + +/// Set up the global metrics recorder and invariant recorder. +fn init_metrics_provider( + build_info: &BuildInfo, + otlp_config: &OtlpExporterConfig, +) -> anyhow::Result> { + let prometheus_recorder = install_prometheus_recorder()?; + + let (quickwit_recorder, meter_provider) = if otlp_config.is_enabled() { + let (otlp_recorder, meter_provider) = + install_otlp_metrics_recorder(build_info, otlp_config)?; + let recorder = FanoutBuilder::default() + .add_recorder(prometheus_recorder) + .add_recorder(otlp_recorder) + .build(); + (recorder, Some(meter_provider)) + } else { + let recorder = FanoutBuilder::default() + .add_recorder(prometheus_recorder) + .build(); + (recorder, None) + }; + + let dogstatsd_recorder = install_dogstatsd_recorder(build_info)?; + + let mut router = RouterBuilder::from_recorder(metrics::NoopRecorder); + router + .add_route(MetricKindMask::ALL, "quickwit_", quickwit_recorder) + .add_route(MetricKindMask::ALL, "pomsky.invariant.", dogstatsd_recorder); + let recorder = router.build(); + metrics::set_global_recorder(recorder) + .map_err(|_| anyhow::anyhow!("failed to install global metrics recorder"))?; + quickwit_metrics::describe_metrics(); + Ok(meter_provider) +} + +fn install_prometheus_recorder() -> anyhow::Result { + let mut prometheus_builder = PrometheusBuilder::new(); + for (name, buckets) in quickwit_metrics::histogram_buckets() { + prometheus_builder = prometheus_builder + 
.set_buckets_for_metric(Matcher::Full(name.to_string()), &buckets) + .with_context(|| { + format!("failed to configure Prometheus histogram buckets for `{name}`") + })?; + } + let prometheus_recorder = prometheus_builder.build_recorder(); + let prometheus_handle = prometheus_recorder.handle(); + quickwit_common::metrics::set_prometheus_handle(prometheus_handle.clone()) + .map_err(anyhow::Error::msg)?; + Ok(prometheus_recorder) } -/// Set up DogStatsD metrics exporter and invariant recorder. -#[cfg(not(test))] -pub fn setup_metrics(build_info: &BuildInfo) -> anyhow::Result<()> { +fn install_otlp_metrics_recorder( + build_info: &BuildInfo, + otlp_config: &OtlpExporterConfig, +) -> anyhow::Result<(OpenTelemetryRecorder, SdkMetricsProvider)> { + let metrics_protocol = otlp_config.metrics_protocol()?; + let temporality = otlp_config.metrics_temporality()?; + let metric_exporter = metrics_protocol.metric_exporter(temporality)?; + let resource = Resource::builder() + .with_service_name("quickwit") + .with_attribute(KeyValue::new("service.version", build_info.version.clone())) + .build(); + let metrics_provider = SdkMetricsProvider::builder() + .with_resource(resource) + .with_periodic_exporter(metric_exporter) + .build(); + let meter = metrics_provider.meter("quickwit"); + + let recorder = OpenTelemetryRecorder::new(meter); + for (name, buckets) in quickwit_metrics::histogram_buckets() { + recorder.set_histogram_bounds(&metrics::KeyName::from(name), buckets); + } + Ok((recorder, metrics_provider)) +} + +fn install_dogstatsd_recorder(build_info: &BuildInfo) -> anyhow::Result { // Reading both `CLOUDPREM_*` and `CP_*` env vars for backward compatibility. The former is // deprecated and can be removed after 2026-04-01. 
let host: String = quickwit_common::get_from_env_opt("CLOUDPREM_DOGSTATSD_SERVER_HOST", false) @@ -270,18 +521,17 @@ pub fn setup_metrics(build_info: &BuildInfo) -> anyhow::Result<()> { global_labels.push(::metrics::Label::new(label_key, label_val)); } } - metrics_exporter_dogstatsd::DogStatsDBuilder::default() + let recorder = metrics_exporter_dogstatsd::DogStatsDBuilder::default() .set_global_prefix("cloudprem") .with_global_labels(global_labels) .with_remote_address(addr) .context("failed to parse DogStatsD server address")? - .install() - .context("failed to register DogStatsD exporter")?; + .build() + .context("failed to build DogStatsD exporter")?; quickwit_dst::invariants::set_invariant_recorder(invariant_recorder); - Ok(()) + Ok(recorder) } -#[cfg(not(test))] fn invariant_recorder(invariant_id: quickwit_dst::invariants::InvariantId, passed: bool) { let name = invariant_id.as_str(); metrics::counter!("pomsky.invariant.checked", "invariant" => name).increment(1); @@ -553,6 +803,7 @@ pub(super) mod jemalloc_profiled { #[cfg(test)] mod tests { + use std::ffi::OsString; use std::sync::{Arc, Mutex}; use tracing_subscriber::layer::SubscriberExt; @@ -573,6 +824,121 @@ mod tests { assert!(OtlpProtocol::from_str("http/xml").is_err()); } + fn otlp_exporter_config(default_protocol: &str) -> OtlpExporterConfig { + OtlpExporterConfig { + enabled: true, + default_protocol: default_protocol.to_string(), + } + } + + struct EnvVarGuard { + key: &'static str, + previous_value: Option, + } + + impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + let guard = Self { + key, + previous_value: std::env::var_os(key), + }; + unsafe { std::env::set_var(key, value) }; + guard + } + + fn remove(key: &'static str) -> Self { + let guard = Self { + key, + previous_value: std::env::var_os(key), + }; + unsafe { std::env::remove_var(key) }; + guard + } + } + + impl Drop for EnvVarGuard { + fn drop(&mut self) { + if let Some(previous_value) = &self.previous_value { + unsafe 
{ std::env::set_var(self.key, previous_value) }; + } else { + unsafe { std::env::remove_var(self.key) }; + } + } + } + + #[test] + fn test_otlp_exporter_config_uses_signal_specific_protocol() { + const TEST_PROTOCOL_ENV_KEY: &str = "QW_TEST_OTLP_SIGNAL_PROTOCOL"; + + let _guard = EnvVarGuard::set(TEST_PROTOCOL_ENV_KEY, "http/json"); + + assert_eq!( + otlp_exporter_config("grpc") + .resolve_protocol(TEST_PROTOCOL_ENV_KEY) + .unwrap(), + OtlpProtocol::HttpJson + ); + } + + #[test] + fn test_otlp_exporter_config_falls_back_to_default_protocol() { + const TEST_PROTOCOL_ENV_KEY: &str = "QW_TEST_OTLP_DEFAULT_PROTOCOL_FALLBACK"; + + let _guard = EnvVarGuard::remove(TEST_PROTOCOL_ENV_KEY); + + assert_eq!( + otlp_exporter_config("http/protobuf") + .resolve_protocol(TEST_PROTOCOL_ENV_KEY) + .unwrap(), + OtlpProtocol::HttpProtobuf + ); + } + + #[test] + fn test_otlp_exporter_config_signal_protocol_error_names_signal_env_var() { + const TEST_PROTOCOL_ENV_KEY: &str = "QW_TEST_OTLP_INVALID_SIGNAL_PROTOCOL"; + + let _guard = EnvVarGuard::set(TEST_PROTOCOL_ENV_KEY, "http/xml"); + + let error = otlp_exporter_config("grpc") + .resolve_protocol(TEST_PROTOCOL_ENV_KEY) + .unwrap_err(); + let error = format!("{error:#}"); + assert!(error.contains(TEST_PROTOCOL_ENV_KEY)); + assert!(error.contains("unsupported OTLP protocol `http/xml`")); + } + + #[test] + fn test_otlp_exporter_config_default_protocol_error_names_default_env_var() { + const TEST_PROTOCOL_ENV_KEY: &str = "QW_TEST_OTLP_INVALID_DEFAULT_PROTOCOL"; + + let _guard = EnvVarGuard::remove(TEST_PROTOCOL_ENV_KEY); + + let error = otlp_exporter_config("http/xml") + .resolve_protocol(TEST_PROTOCOL_ENV_KEY) + .unwrap_err(); + let error = format!("{error:#}"); + assert!(error.contains(OTEL_EXPORTER_OTLP_PROTOCOL_ENV_KEY)); + assert!(error.contains("unsupported OTLP protocol `http/xml`")); + } + + #[test] + fn test_otlp_metrics_temporality_from_str() { + assert_eq!( + Temporality::from(OtlpMetricsTemporality::from_str("delta").unwrap()), 
+ Temporality::Delta + ); + assert_eq!( + Temporality::from(OtlpMetricsTemporality::from_str("lowmemory").unwrap()), + Temporality::LowMemory + ); + assert_eq!( + Temporality::from(OtlpMetricsTemporality::from_str("cumulative").unwrap()), + Temporality::Cumulative + ); + assert!(OtlpMetricsTemporality::from_str("invalid").is_err()); + } + /// A shared buffer writer for capturing log output in tests. #[derive(Clone, Default)] struct TestMakeWriter(Arc>>); diff --git a/quickwit/quickwit-cli/src/main.rs b/quickwit/quickwit-cli/src/main.rs index 8828398edba..6e0fe60c492 100644 --- a/quickwit/quickwit-cli/src/main.rs +++ b/quickwit/quickwit-cli/src/main.rs @@ -22,7 +22,7 @@ use quickwit_cli::checklist::RED_COLOR; use quickwit_cli::cli::{CliCommand, build_cli}; #[cfg(feature = "jemalloc")] use quickwit_cli::jemalloc::start_jemalloc_metrics_loop; -use quickwit_cli::logger::setup_logging_and_tracing; +use quickwit_cli::logger::{TelemetryHandle, init_telemetry}; use quickwit_cli::{busy_detector, install_default_crypto_ring_provider}; use quickwit_common::runtimes::scrape_tokio_runtime_metrics; use quickwit_serve::BuildInfo; @@ -40,6 +40,8 @@ fn get_main_runtime_num_threads() -> usize { } fn main() -> anyhow::Result<()> { + let (command, ansi_colors) = parse_cli_command(); + let main_runtime_num_threads: usize = get_main_runtime_num_threads(); let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -50,33 +52,24 @@ fn main() -> anyhow::Result<()> { .build() .context("failed to start main Tokio runtime")?; - scrape_tokio_runtime_metrics(rt.handle(), "main"); + rt.block_on(async move { + #[cfg(feature = "openssl-support")] + unsafe { + openssl_probe::init_openssl_env_vars() + }; + install_default_crypto_ring_provider(); - rt.block_on(main_impl()) -} + let telemetry_handle = + init_telemetry(command.default_log_level(), ansi_colors, BuildInfo::get())?; -fn register_build_info_metric() { - use itertools::Itertools; - let build_info = BuildInfo::get(); - let mut 
build_kvs = BTreeMap::default(); - build_kvs.insert("build_date", build_info.build_date.to_string()); - build_kvs.insert("commit_hash", build_info.commit_short_hash.to_string()); - build_kvs.insert("version", build_info.version.to_string()); - if !build_info.commit_tags.is_empty() { - let tags_str = build_info.commit_tags.iter().join(","); - build_kvs.insert("commit_tags", tags_str); - } - build_kvs.insert("target", build_info.build_target.to_string()); - quickwit_common::metrics::register_info("build_info", "Quickwit's build info", build_kvs); -} + let runtime_handle = tokio::runtime::Handle::current(); + scrape_tokio_runtime_metrics(&runtime_handle, "main"); -async fn main_impl() -> anyhow::Result<()> { - #[cfg(feature = "openssl-support")] - unsafe { - openssl_probe::init_openssl_env_vars() - }; - register_build_info_metric(); + main_impl(command, telemetry_handle).await + }) +} +fn parse_cli_command() -> (CliCommand, bool) { let about_text = about_text(); let version_text = BuildInfo::get_version_text(); @@ -91,20 +84,34 @@ async fn main_impl() -> anyhow::Result<()> { std::process::exit(1); } }; + (command, ansi_colors) +} - install_default_crypto_ring_provider(); +fn register_build_info_metric() { + use itertools::Itertools; + let build_info = BuildInfo::get(); + let mut build_kvs = BTreeMap::default(); + build_kvs.insert("build_date", build_info.build_date.to_string()); + build_kvs.insert("commit_hash", build_info.commit_short_hash.to_string()); + build_kvs.insert("version", build_info.version.to_string()); + if !build_info.commit_tags.is_empty() { + let tags_str = build_info.commit_tags.iter().join(","); + build_kvs.insert("commit_tags", tags_str); + } + build_kvs.insert("target", build_info.build_target.to_string()); + quickwit_common::metrics::register_info("build_info", "Quickwit's build info", build_kvs); +} + +async fn main_impl(command: CliCommand, telemetry_handle: TelemetryHandle) -> anyhow::Result<()> { + register_build_info_metric(); #[cfg(feature 
= "jemalloc")] start_jemalloc_metrics_loop(); - let build_info = BuildInfo::get(); - let (env_filter_reload_fn, tracer_provider_opt) = - setup_logging_and_tracing(command.default_log_level(), ansi_colors, build_info)?; - - #[cfg(not(test))] - quickwit_cli::logger::setup_metrics(build_info)?; - - let return_code: i32 = if let Err(command_error) = command.execute(env_filter_reload_fn).await { + let return_code: i32 = if let Err(command_error) = command + .execute(telemetry_handle.env_filter_reload_fn()) + .await + { error!(error=%command_error, "command failed"); eprintln!( "{} command failed: {:?}\n", @@ -116,14 +123,7 @@ async fn main_impl() -> anyhow::Result<()> { 0 }; - if let Some((trace_provider, logs_provider)) = tracer_provider_opt { - trace_provider - .shutdown() - .context("failed to shutdown OpenTelemetry tracer provider")?; - logs_provider - .shutdown() - .context("failed to shutdown OpenTelemetry logs provider")?; - } + telemetry_handle.shutdown()?; std::process::exit(return_code) } diff --git a/quickwit/quickwit-cli/src/metrics.rs b/quickwit/quickwit-cli/src/metrics.rs index c51d010c9ea..ae5d9647d9d 100644 --- a/quickwit/quickwit-cli/src/metrics.rs +++ b/quickwit/quickwit-cli/src/metrics.rs @@ -14,26 +14,14 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{HistogramVec, new_histogram_vec}; +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{Histogram, histogram}; -pub struct CliMetrics { - pub thread_unpark_duration_microseconds: HistogramVec<0>, -} - -impl Default for CliMetrics { - fn default() -> Self { - CliMetrics { - thread_unpark_duration_microseconds: new_histogram_vec( - "thread_unpark_duration_microseconds", - "Duration for which a thread of the main tokio runtime is unparked.", - "cli", - &[], - [], - quickwit_common::metrics::exponential_buckets(5.0, 5.0, 5).unwrap(), - ), - } - } -} - -/// Serve counters exposes a bunch a set of metrics about the request received to quickwit. 
-pub static CLI_METRICS: LazyLock = LazyLock::new(CliMetrics::default); +pub(crate) static THREAD_UNPARK_DURATION_MICROSECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "thread_unpark_duration_microseconds", + description: "Duration for which a thread of the main tokio runtime is unparked.", + subsystem: "cli", + buckets: exponential_buckets(5.0, 5.0, 5).unwrap(), + ) +}); diff --git a/quickwit/quickwit-cluster/Cargo.toml b/quickwit/quickwit-cluster/Cargo.toml index ab01f587cb9..758b3af3cdb 100644 --- a/quickwit/quickwit-cluster/Cargo.toml +++ b/quickwit/quickwit-cluster/Cargo.toml @@ -29,6 +29,7 @@ tracing = { workspace = true } utoipa = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-cluster/src/grpc_gossip.rs b/quickwit/quickwit-cluster/src/grpc_gossip.rs index 0ebf9f662d2..ddd6f14c7bf 100644 --- a/quickwit/quickwit-cluster/src/grpc_gossip.rs +++ b/quickwit/quickwit-cluster/src/grpc_gossip.rs @@ -31,7 +31,6 @@ use tracing::{info, warn}; use crate::grpc_service::cluster_grpc_client; use crate::member::NodeStateExt; -use crate::metrics::CLUSTER_METRICS; const MAX_GOSSIP_PEERS: usize = 3; @@ -108,7 +107,7 @@ async fn perform_grpc_gossip_rounds( warn!("failed to fetch cluster state from node `{node_id}`"); continue; }; - CLUSTER_METRICS.grpc_gossip_rounds_total.inc(); + crate::metrics::GRPC_GOSSIP_ROUNDS_TOTAL.increment(1); let mut chitchat_guard = chitchat.lock().await; diff --git a/quickwit/quickwit-cluster/src/lib.rs b/quickwit/quickwit-cluster/src/lib.rs index 0f2dbebf749..0387b4e5123 100644 --- a/quickwit/quickwit-cluster/src/lib.rs +++ b/quickwit/quickwit-cluster/src/lib.rs @@ -31,10 +31,10 @@ use chitchat::transport::{Socket, Transport, UdpSocket}; use chitchat::{ChitchatMessage, Serializable}; pub use chitchat::{FailureDetectorConfig, KeyChangeEvent, ListenerHandle}; pub use 
grpc_service::cluster_grpc_server; -use quickwit_common::metrics::IntCounter; use quickwit_common::tower::ClientGrpcConfig; use quickwit_config::service::QuickwitService; use quickwit_config::{GrpcConfig, NodeConfig, TlsConfig}; +use quickwit_metrics::Counter; use quickwit_proto::indexing::CpuCapacity; use quickwit_proto::ingest::ingester::IngesterStatus; use quickwit_proto::tonic::transport::{Certificate, ClientTlsConfig, Identity}; @@ -74,10 +74,10 @@ struct CountingUdpTransport; struct CountingUdpSocket { socket: UdpSocket, - gossip_recv: IntCounter, - gossip_recv_bytes: IntCounter, - gossip_send: IntCounter, - gossip_send_bytes: IntCounter, + gossip_recv: Counter, + gossip_recv_bytes: Counter, + gossip_send: Counter, + gossip_send_bytes: Counter, } #[async_trait] @@ -85,16 +85,16 @@ impl Socket for CountingUdpSocket { async fn send(&mut self, to: SocketAddr, msg: ChitchatMessage) -> anyhow::Result<()> { let msg_len = msg.serialized_len() as u64; self.socket.send(to, msg).await?; - self.gossip_send.inc(); - self.gossip_send_bytes.inc_by(msg_len); + self.gossip_send.increment(1); + self.gossip_send_bytes.increment(msg_len); Ok(()) } async fn recv(&mut self) -> anyhow::Result<(SocketAddr, ChitchatMessage)> { let (socket_addr, msg) = self.socket.recv().await?; - self.gossip_recv.inc(); + self.gossip_recv.increment(1); let msg_len = msg.serialized_len() as u64; - self.gossip_recv_bytes.inc_by(msg_len); + self.gossip_recv_bytes.increment(msg_len); Ok((socket_addr, msg)) } } @@ -105,18 +105,10 @@ impl Transport for CountingUdpTransport { let socket = UdpSocket::open(listen_addr).await?; Ok(Box::new(CountingUdpSocket { socket, - gossip_recv: crate::metrics::CLUSTER_METRICS - .gossip_recv_messages_total - .clone(), - gossip_recv_bytes: crate::metrics::CLUSTER_METRICS - .gossip_recv_bytes_total - .clone(), - gossip_send: crate::metrics::CLUSTER_METRICS - .gossip_sent_messages_total - .clone(), - gossip_send_bytes: crate::metrics::CLUSTER_METRICS - 
.gossip_sent_bytes_total - .clone(), + gossip_recv: crate::metrics::GOSSIP_RECV_MESSAGES_TOTAL.clone(), + gossip_recv_bytes: crate::metrics::GOSSIP_RECV_BYTES_TOTAL.clone(), + gossip_send: crate::metrics::GOSSIP_SENT_MESSAGES_TOTAL.clone(), + gossip_send_bytes: crate::metrics::GOSSIP_SENT_BYTES_TOTAL.clone(), })) } } diff --git a/quickwit/quickwit-cluster/src/metrics.rs b/quickwit/quickwit-cluster/src/metrics.rs index a5ac5d4a9ef..ab6fffe66af 100644 --- a/quickwit/quickwit-cluster/src/metrics.rs +++ b/quickwit/quickwit-cluster/src/metrics.rs @@ -18,106 +18,106 @@ use std::sync::{LazyLock, Weak}; use std::time::Duration; use chitchat::{Chitchat, ChitchatId}; -use quickwit_common::metrics::{IntCounter, IntGauge, new_counter, new_gauge}; +use quickwit_metrics::{Counter, Gauge, counter, gauge}; use tokio::sync::Mutex; use crate::member::NodeStateExt; -pub struct ClusterMetrics { - pub live_nodes: IntGauge, - pub ready_nodes: IntGauge, - pub zombie_nodes: IntGauge, - pub dead_nodes: IntGauge, - pub cluster_state_size_bytes: IntGauge, - pub node_state_size_bytes: IntGauge, - pub node_state_keys: IntGauge, - pub gossip_recv_messages_total: IntCounter, - pub gossip_recv_bytes_total: IntCounter, - pub gossip_sent_messages_total: IntCounter, - pub gossip_sent_bytes_total: IntCounter, - pub grpc_gossip_rounds_total: IntCounter, -} - -impl Default for ClusterMetrics { - fn default() -> Self { - ClusterMetrics { - live_nodes: new_gauge( - "live_nodes", - "The number of live nodes observed locally.", - "cluster", - &[], - ), - ready_nodes: new_gauge( - "ready_nodes", - "The number of ready nodes observed locally.", - "cluster", - &[], - ), - zombie_nodes: new_gauge( - "zombie_nodes", - "The number of zombie nodes observed locally.", - "cluster", - &[], - ), - dead_nodes: new_gauge( - "dead_nodes", - "The number of dead nodes observed locally.", - "cluster", - &[], - ), - cluster_state_size_bytes: new_gauge( - "cluster_state_size_bytes", - "The size of the cluster state in 
bytes.", - "cluster", - &[], - ), - node_state_keys: new_gauge( - "node_state_keys", - "The number of keys in the node state.", - "cluster", - &[], - ), - node_state_size_bytes: new_gauge( - "node_state_size_bytes", - "The size of the node state in bytes.", - "cluster", - &[], - ), - gossip_recv_messages_total: new_counter( - "gossip_recv_messages_total", - "Total number of gossip messages received.", - "cluster", - &[], - ), - gossip_recv_bytes_total: new_counter( - "gossip_recv_bytes_total", - "Total amount of gossip data received in bytes.", - "cluster", - &[], - ), - gossip_sent_messages_total: new_counter( - "gossip_sent_messages_total", - "Total number of gossip messages sent.", - "cluster", - &[], - ), - gossip_sent_bytes_total: new_counter( - "gossip_sent_bytes_total", - "Total amount of gossip data sent in bytes.", - "cluster", - &[], - ), - grpc_gossip_rounds_total: new_counter( - "grpc_gossip_rounds_total", - "Total number of gRPC gossip rounds performed with peer nodes.", - "cluster", - &[], - ), - } - } -} - -pub static CLUSTER_METRICS: LazyLock = LazyLock::new(ClusterMetrics::default); +pub(crate) static LIVE_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "live_nodes", + description: "The number of live nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static READY_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "ready_nodes", + description: "The number of ready nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static ZOMBIE_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "zombie_nodes", + description: "The number of zombie nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static DEAD_NODES: LazyLock = LazyLock::new(|| { + gauge!( + name: "dead_nodes", + description: "The number of dead nodes observed locally.", + subsystem: "cluster", + ) +}); + +pub(crate) static CLUSTER_STATE_SIZE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: 
"cluster_state_size_bytes", + description: "The size of the cluster state in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static NODE_STATE_KEYS: LazyLock = LazyLock::new(|| { + gauge!( + name: "node_state_keys", + description: "The number of keys in the node state.", + subsystem: "cluster", + ) +}); + +pub(crate) static NODE_STATE_SIZE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "node_state_size_bytes", + description: "The size of the node state in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_RECV_MESSAGES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_recv_messages_total", + description: "Total number of gossip messages received.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_RECV_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_recv_bytes_total", + description: "Total amount of gossip data received in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_SENT_MESSAGES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_sent_messages_total", + description: "Total number of gossip messages sent.", + subsystem: "cluster", + ) +}); + +pub(crate) static GOSSIP_SENT_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gossip_sent_bytes_total", + description: "Total amount of gossip data sent in bytes.", + subsystem: "cluster", + ) +}); + +pub(crate) static GRPC_GOSSIP_ROUNDS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "grpc_gossip_rounds_total", + description: "Total number of gRPC gossip rounds performed with peer nodes.", + subsystem: "cluster", + ) +}); pub(crate) fn spawn_metrics_task( weak_chitchat: Weak>, @@ -155,24 +155,18 @@ pub(crate) fn spawn_metrics_task( cluster_state_size_bytes += chitchat_id_size_bytes + node_state_size_bytes; if *chitchat_id == self_chitchat_id { - CLUSTER_METRICS - .node_state_keys - .set(node_state.num_key_values() as i64); - CLUSTER_METRICS - .node_state_size_bytes - 
.set(node_state_size_bytes as i64); + NODE_STATE_KEYS.set(node_state.num_key_values() as f64); + NODE_STATE_SIZE_BYTES.set(node_state_size_bytes as f64); } } drop(chitchat_guard); - CLUSTER_METRICS.live_nodes.set(num_live_nodes as i64); - CLUSTER_METRICS.ready_nodes.set(num_ready_nodes as i64); - CLUSTER_METRICS.zombie_nodes.set(num_zombie_nodes as i64); - CLUSTER_METRICS.dead_nodes.set(num_dead_nodes as i64); + LIVE_NODES.set(num_live_nodes as f64); + READY_NODES.set(num_ready_nodes as f64); + ZOMBIE_NODES.set(num_zombie_nodes as f64); + DEAD_NODES.set(num_dead_nodes as f64); - CLUSTER_METRICS - .cluster_state_size_bytes - .set(cluster_state_size_bytes as i64); + CLUSTER_STATE_SIZE_BYTES.set(cluster_state_size_bytes as f64); } }; tokio::spawn(future); diff --git a/quickwit/quickwit-common/Cargo.toml b/quickwit/quickwit-common/Cargo.toml index 14c05e19c5e..7c7dc3cfbef 100644 --- a/quickwit/quickwit-common/Cargo.toml +++ b/quickwit/quickwit-common/Cargo.toml @@ -28,9 +28,12 @@ http = { workspace = true } hyper = { workspace = true } hyper-util = { workspace = true, optional = true } itertools = { workspace = true } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } pin-project = { workspace = true } pnet = { workspace = true } prometheus = { workspace = true } +quickwit-metrics = { workspace = true } rand = { workspace = true } rayon = { workspace = true } regex = { workspace = true } @@ -62,6 +65,7 @@ jemalloc-profiled = [ [dev-dependencies] hyper-util = { workspace = true } +metrics-util = { workspace = true } proptest = { workspace = true } serde_json = { workspace = true } serial_test = { workspace = true } diff --git a/quickwit/quickwit-common/src/io.rs b/quickwit/quickwit-common/src/io.rs index e1d9ad796f1..c37e4921db1 100644 --- a/quickwit/quickwit-common/src/io.rs +++ b/quickwit/quickwit-common/src/io.rs @@ -34,10 +34,10 @@ use async_speed_limit::clock::StandardClock; use async_speed_limit::limiter::Consume; use 
bytesize::ByteSize; use pin_project::pin_project; -use prometheus::IntCounter; +use quickwit_metrics::{Counter, Labels, counter}; use tokio::io::AsyncWrite; -use crate::metrics::{IntCounterVec, new_counter_vec}; +use crate::metrics::MaybeRegisteredCounter; use crate::{KillSwitch, Progress, ProtectedZoneGuard}; // Max 1MB at a time. @@ -48,25 +48,16 @@ fn truncate_bytes(bytes: &[u8]) -> &[u8] { &bytes[..num_bytes] } -struct IoMetrics { - write_bytes: IntCounterVec<1>, -} - -impl Default for IoMetrics { - fn default() -> Self { - let write_bytes = new_counter_vec( - "write_bytes", - "Number of bytes written by a given component in [indexer, merger, deleter, \ - split_downloader_{merge,delete}]", - "", - &[], - ["component"], - ); - Self { write_bytes } - } -} +static WRITE_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "write_bytes", + description: "Number of bytes written by a given component in [indexer, merger, deleter, split_downloader_{merge,delete}]", + subsystem: "", + observable: true, + ) +}); -static IO_METRICS: LazyLock = LazyLock::new(IoMetrics::default); +const COMPONENT_LABELS: Labels<1> = Labels::new(["component"]); /// Parameter used in `async_speed_limit`. 
/// @@ -88,27 +79,14 @@ pub fn limiter(throughput: ByteSize) -> Limiter { .build() } -#[derive(Clone)] +#[derive(Clone, Default)] pub struct IoControls { throughput_limiter_opt: Option, - bytes_counter: IntCounter, + bytes_counter: MaybeRegisteredCounter, progress: Progress, kill_switch: KillSwitch, } -impl Default for IoControls { - fn default() -> Self { - let default_bytes_counter = - IntCounter::new("default_write_num_bytes", "Default write counter.").unwrap(); - IoControls { - throughput_limiter_opt: None, - progress: Progress::default(), - kill_switch: KillSwitch::default(), - bytes_counter: default_bytes_counter, - } - } -} - impl IoControls { #[must_use] pub fn progress(&self) -> &Progress { @@ -132,7 +110,9 @@ impl IoControls { } pub fn set_component(mut self, component: &str) -> Self { - self.bytes_counter = IO_METRICS.write_bytes.with_label_values([component]); + let labels = COMPONENT_LABELS.with_values([component.to_string()]); + self.bytes_counter = + MaybeRegisteredCounter::registered(counter!(parent: &*WRITE_BYTES, labels: &labels)); self } @@ -148,8 +128,8 @@ impl IoControls { self } - pub fn set_bytes_counter(mut self, bytes_counter: IntCounter) -> Self { - self.bytes_counter = bytes_counter; + pub fn set_bytes_counter(mut self, bytes_counter: Counter) -> Self { + self.bytes_counter = MaybeRegisteredCounter::registered(bytes_counter); self } @@ -167,7 +147,7 @@ impl IoControls { if let Some(throughput_limiter) = &self.throughput_limiter_opt { throughput_limiter.blocking_consume(num_bytes); } - self.bytes_counter.inc_by(num_bytes as u64); + self.bytes_counter.increment(num_bytes as u64); Ok(()) } } @@ -220,7 +200,7 @@ impl ControlledWrite { let len = *obj.as_ref().unwrap_or(&0); if len > 0 { let waiter = this.io_controls_access.apply(|io_controls| { - io_controls.bytes_counter.inc_by(len as u64); + io_controls.bytes_counter.increment(len as u64); io_controls .throughput_limiter_opt .as_ref() diff --git a/quickwit/quickwit-common/src/metrics.rs 
b/quickwit/quickwit-common/src/metrics.rs index 193def5e01a..54f1b14bfac 100644 --- a/quickwit/quickwit-common/src/metrics.rs +++ b/quickwit/quickwit-common/src/metrics.rs @@ -12,442 +12,299 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, HashMap}; -use std::sync::{LazyLock, OnceLock}; +use std::collections::BTreeMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, LazyLock, OnceLock}; +#[cfg(not(test))] +use std::time::Duration; -use prometheus::{Gauge, HistogramOpts, Opts, TextEncoder}; -pub use prometheus::{ - Histogram, HistogramTimer, HistogramVec as PrometheusHistogramVec, IntCounter, - IntCounterVec as PrometheusIntCounterVec, IntGauge, IntGaugeVec as PrometheusIntGaugeVec, - exponential_buckets, linear_buckets, -}; +use metrics_exporter_prometheus::PrometheusHandle; +pub use prometheus::{exponential_buckets, linear_buckets}; +use quickwit_metrics::{Counter, Gauge, Labels, gauge}; -#[derive(Clone)] -pub struct HistogramVec { - underlying: PrometheusHistogramVec, -} +static PROMETHEUS_HANDLE: OnceLock = OnceLock::new(); -impl HistogramVec { - pub fn with_label_values(&self, label_values: [&str; N]) -> Histogram { - self.underlying.with_label_values(&label_values) - } +#[derive(Clone)] +pub struct MaybeRegisteredCounter { + inner: MaybeRegisteredCounterInner, } #[derive(Clone)] -pub struct IntCounterVec { - underlying: PrometheusIntCounterVec, +enum MaybeRegisteredCounterInner { + Local(Arc), + Registered(Counter), } -impl IntCounterVec { - pub fn new( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], - ) -> IntCounterVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let counter_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - 
.const_labels(owned_const_labels); - let underlying = PrometheusIntCounterVec::new(counter_opts, &label_names) - .expect("failed to create counter vec"); - IntCounterVec { underlying } +impl Default for MaybeRegisteredCounter { + fn default() -> Self { + Self::local() } +} - pub fn with_label_values(&self, label_values: [&str; N]) -> IntCounter { - self.underlying.with_label_values(&label_values) +impl MaybeRegisteredCounter { + pub fn local() -> Self { + Self { + inner: MaybeRegisteredCounterInner::Local(Arc::new(AtomicU64::new(0))), + } } -} -#[derive(Clone)] -pub struct IntGaugeVec { - underlying: PrometheusIntGaugeVec, -} + pub fn registered(counter: Counter) -> Self { + Self { + inner: MaybeRegisteredCounterInner::Registered(counter), + } + } -impl IntGaugeVec { - pub fn with_label_values(&self, label_values: [&str; N]) -> IntGauge { - self.underlying.with_label_values(&label_values) + pub fn increment(&self, value: u64) { + match &self.inner { + MaybeRegisteredCounterInner::Local(counter) => { + counter.fetch_add(value, Ordering::Relaxed); + } + MaybeRegisteredCounterInner::Registered(counter) => counter.increment(value), + } } -} -pub fn register_info(name: &'static str, help: &'static str, kvs: BTreeMap<&'static str, String>) { - let mut counter_opts = Opts::new(name, help).namespace("quickwit"); - for (k, v) in kvs { - counter_opts = counter_opts.const_label(k, v); + pub fn get(&self) -> u64 { + match &self.inner { + MaybeRegisteredCounterInner::Local(counter) => counter.load(Ordering::Relaxed), + MaybeRegisteredCounterInner::Registered(counter) => counter.get(), + } } - let counter = IntCounter::with_opts(counter_opts).expect("failed to create counter"); - counter.inc(); - prometheus::register(Box::new(counter)).expect("failed to register counter"); } -pub fn new_counter( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> IntCounter { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, 
label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let counter_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let counter = IntCounter::with_opts(counter_opts).expect("failed to create counter"); - prometheus::register(Box::new(counter.clone())).expect("failed to register counter"); - counter +pub fn set_prometheus_handle(handle: PrometheusHandle) -> Result<(), String> { + #[cfg(not(test))] + let upkeep_handle = handle.clone(); + PROMETHEUS_HANDLE + .set(handle) + .map_err(|_| "Prometheus metrics renderer is already installed".to_string())?; + #[cfg(not(test))] + spawn_prometheus_upkeep(upkeep_handle)?; + Ok(()) } -pub fn new_counter_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], -) -> IntCounterVec { - let int_counter_vec = IntCounterVec::new(name, help, subsystem, const_labels, label_names); - let collector = Box::new(int_counter_vec.underlying.clone()); - prometheus::register(collector).expect("failed to register counter vec"); - int_counter_vec +pub fn metrics_text_payload() -> Result { + let handle = PROMETHEUS_HANDLE + .get() + .ok_or_else(|| "Prometheus metrics rendering is not installed yet".to_string())?; + Ok(handle.render()) } -pub fn new_float_gauge( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> Gauge { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let gauge = Gauge::with_opts(gauge_opts).expect("failed to create float gauge"); - prometheus::register(Box::new(gauge.clone())).expect("failed to register float gauge"); - gauge +#[cfg(not(test))] +fn spawn_prometheus_upkeep(handle: PrometheusHandle) -> Result<(), 
String> { + // Quickwit serves the existing `/metrics` route itself, so we build only the + // Prometheus recorder instead of using the exporter's HTTP listener. That lower-level + // API does not spawn the upkeep task that periodically drains histogram buffers. + std::thread::Builder::new() + .name("metrics-exporter-prometheus-upkeep".to_string()) + .spawn(move || { + loop { + std::thread::sleep(Duration::from_secs(5)); + handle.run_upkeep(); + } + }) + .map(|_| ()) + .map_err(|error| format!("failed to spawn Prometheus metrics upkeep thread: {error}")) } -pub fn new_gauge( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], -) -> IntGauge { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let gauge = IntGauge::with_opts(gauge_opts).expect("failed to create gauge"); - prometheus::register(Box::new(gauge.clone())).expect("failed to register gauge"); - gauge +pub fn register_info(name: &'static str, help: &'static str, kvs: BTreeMap<&'static str, String>) { + let key_name = format!("quickwit_{name}"); + let labels = kvs + .into_iter() + .map(|(label, value)| metrics::Label::new(label, value)) + .collect::>(); + let key = metrics::Key::from_parts(key_name.clone(), labels); + let metadata = metrics::Metadata::new("", metrics::Level::INFO, Some(module_path!())); + metrics::with_recorder(|recorder| { + recorder.describe_counter(metrics::KeyName::from(key_name), None, help.into()); + recorder.register_counter(&key, &metadata).increment(1); + }); } -pub fn new_gauge_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], -) -> IntGaugeVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| 
(label_name.to_string(), label_value.to_string())) - .collect(); - let gauge_opts = Opts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels); - let underlying = - PrometheusIntGaugeVec::new(gauge_opts, &label_names).expect("failed to create gauge vec"); - - let collector = Box::new(underlying.clone()); - prometheus::register(collector).expect("failed to register counter vec"); - - IntGaugeVec { underlying } -} +pub fn index_label(index_id: &str) -> &str { + static PER_INDEX_METRICS_ENABLED: LazyLock = + LazyLock::new(|| !crate::get_bool_from_env("QW_DISABLE_PER_INDEX_METRICS", false)); -pub fn new_histogram(name: &str, help: &str, subsystem: &str, buckets: Vec) -> Histogram { - let histogram_opts = HistogramOpts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .buckets(buckets); - let histogram = Histogram::with_opts(histogram_opts).expect("failed to create histogram"); - prometheus::register(Box::new(histogram.clone())).expect("failed to register histogram"); - histogram + if *PER_INDEX_METRICS_ENABLED { + index_id + } else { + "__any__" + } } -pub fn new_histogram_vec( - name: &str, - help: &str, - subsystem: &str, - const_labels: &[(&str, &str)], - label_names: [&str; N], - buckets: Vec, -) -> HistogramVec { - let owned_const_labels: HashMap = const_labels - .iter() - .map(|(label_name, label_value)| (label_name.to_string(), label_value.to_string())) - .collect(); - let histogram_opts = HistogramOpts::new(name, help) - .namespace("quickwit") - .subsystem(subsystem) - .const_labels(owned_const_labels) - .buckets(buckets); - let underlying = PrometheusHistogramVec::new(histogram_opts, &label_names) - .expect("failed to create histogram vec"); - - let collector = Box::new(underlying.clone()); - prometheus::register(collector).expect("failed to register histogram vec"); - - HistogramVec { underlying } -} +pub static MEMORY_ACTIVE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: 
"active_bytes", + description: "Total number of bytes in active pages allocated by the application, as reported by jemalloc `stats.active`.", + subsystem: "memory", + ) +}); -pub struct GaugeGuard<'a> { - gauge: &'a IntGauge, - delta: i64, -} +pub static MEMORY_ALLOCATED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "allocated_bytes", + description: "Total number of bytes allocated by the application, as reported by jemalloc `stats.allocated`.", + subsystem: "memory", + ) +}); -impl std::fmt::Debug for GaugeGuard<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.delta.fmt(f) - } -} +pub static MEMORY_RESIDENT_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "resident_bytes", + description: " Total number of bytes in physically resident data pages mapped by the allocator, as reported by jemalloc `stats.resident`.", + subsystem: "memory", + ) +}); -impl<'a> GaugeGuard<'a> { - pub fn from_gauge(gauge: &'a IntGauge) -> Self { - Self { gauge, delta: 0i64 } - } +static IN_FLIGHT_DATA_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "in_flight_data_bytes", + description: "Amount of data in-flight in various buffers in bytes.", + subsystem: "memory", + ) +}); - pub fn get(&self) -> i64 { - self.delta - } +const COMPONENT_LABELS: Labels<1> = Labels::new(["component"]); - pub fn add(&mut self, delta: i64) { - self.gauge.add(delta); - self.delta += delta; - } +pub static IN_FLIGHT_REST_SERVER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("rest_server")); - pub fn sub(&mut self, delta: i64) { - self.gauge.sub(delta); - self.delta -= delta; - } -} +pub static IN_FLIGHT_INGEST_ROUTER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingest_router")); -impl Drop for GaugeGuard<'_> { - fn drop(&mut self) { - self.gauge.sub(self.delta) - } -} +pub static IN_FLIGHT_INGESTER_PERSIST: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingester_persist")); -pub struct OwnedGaugeGuard { - gauge: IntGauge, - delta: i64, -} +pub 
static IN_FLIGHT_INGESTER_REPLICATE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingester_replicate")); -impl std::fmt::Debug for OwnedGaugeGuard { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.delta.fmt(f) - } -} +pub static IN_FLIGHT_WAL: LazyLock = LazyLock::new(|| in_flight_data_gauge("wal")); -impl OwnedGaugeGuard { - pub fn from_gauge(gauge: IntGauge) -> Self { - Self { gauge, delta: 0i64 } - } +pub static IN_FLIGHT_FETCH_STREAM: LazyLock = + LazyLock::new(|| in_flight_data_gauge("fetch_stream")); - pub fn get(&self) -> i64 { - self.delta - } +pub static IN_FLIGHT_MULTI_FETCH_STREAM: LazyLock = + LazyLock::new(|| in_flight_data_gauge("multi_fetch_stream")); - pub fn add(&mut self, delta: i64) { - self.gauge.add(delta); - self.delta += delta; - } +pub static IN_FLIGHT_DOC_PROCESSOR_MAILBOX: LazyLock = + LazyLock::new(|| in_flight_data_gauge("doc_processor_mailbox")); - pub fn sub(&mut self, delta: i64) { - self.gauge.sub(delta); - self.delta -= delta; - } -} +pub static IN_FLIGHT_INDEXER_MAILBOX: LazyLock = + LazyLock::new(|| in_flight_data_gauge("indexer_mailbox")); -impl Drop for OwnedGaugeGuard { - fn drop(&mut self) { - self.gauge.sub(self.delta) - } -} +pub static IN_FLIGHT_INDEX_WRITER: LazyLock = + LazyLock::new(|| in_flight_data_gauge("index_writer")); -pub fn metrics_text_payload() -> Result { - let metric_families = prometheus::gather(); - // Arbitrary non-zero size in order to skip a bunch of - // buffer growth-reallocations when encoding metrics. 
- let mut buffer = String::with_capacity(1024); - let encoder = TextEncoder::new(); - match encoder.encode_utf8(&metric_families, &mut buffer) { - Ok(()) => Ok(buffer), - Err(e) => Err(e.to_string()), - } -} +pub static IN_FLIGHT_FILE_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("file_source")); -#[derive(Clone)] -pub struct MemoryMetrics { - pub active_bytes: IntGauge, - pub allocated_bytes: IntGauge, - pub resident_bytes: IntGauge, - pub in_flight: InFlightDataGauges, -} +pub static IN_FLIGHT_INGEST_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("ingest_source")); -impl Default for MemoryMetrics { - fn default() -> Self { - Self { - active_bytes: new_gauge( - "active_bytes", - "Total number of bytes in active pages allocated by the application, as reported \ - by jemalloc `stats.active`.", - "memory", - &[], - ), - allocated_bytes: new_gauge( - "allocated_bytes", - "Total number of bytes allocated by the application, as reported by jemalloc \ - `stats.allocated`.", - "memory", - &[], - ), - resident_bytes: new_gauge( - "resident_bytes", - " Total number of bytes in physically resident data pages mapped by the \ - allocator, as reported by jemalloc `stats.resident`.", - "memory", - &[], - ), - in_flight: InFlightDataGauges::default(), - } - } -} +pub static IN_FLIGHT_KAFKA_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("kafka_source")); -#[derive(Clone)] -pub struct InFlightDataGauges { - pub rest_server: IntGauge, - pub ingest_router: IntGauge, - pub ingester_persist: IntGauge, - pub ingester_replicate: IntGauge, - pub wal: IntGauge, - pub fetch_stream: IntGauge, - pub multi_fetch_stream: IntGauge, - pub doc_processor_mailbox: IntGauge, - pub indexer_mailbox: IntGauge, - pub index_writer: IntGauge, - in_flight_gauge_vec: IntGaugeVec<1>, -} +pub static IN_FLIGHT_KINESIS_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("kinesis_source")); -impl Default for InFlightDataGauges { - fn default() -> Self { - let 
in_flight_gauge_vec = new_gauge_vec( - "in_flight_data_bytes", - "Amount of data in-flight in various buffers in bytes.", - "memory", - &[], - ["component"], - ); - Self { - rest_server: in_flight_gauge_vec.with_label_values(["rest_server"]), - ingest_router: in_flight_gauge_vec.with_label_values(["ingest_router"]), - ingester_persist: in_flight_gauge_vec.with_label_values(["ingester_persist"]), - ingester_replicate: in_flight_gauge_vec.with_label_values(["ingester_replicate"]), - wal: in_flight_gauge_vec.with_label_values(["wal"]), - fetch_stream: in_flight_gauge_vec.with_label_values(["fetch_stream"]), - multi_fetch_stream: in_flight_gauge_vec.with_label_values(["multi_fetch_stream"]), - doc_processor_mailbox: in_flight_gauge_vec.with_label_values(["doc_processor_mailbox"]), - indexer_mailbox: in_flight_gauge_vec.with_label_values(["indexer_mailbox"]), - index_writer: in_flight_gauge_vec.with_label_values(["index_writer"]), - in_flight_gauge_vec: in_flight_gauge_vec.clone(), - } - } +pub static IN_FLIGHT_PUBSUB_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("pubsub_source")); + +pub static IN_FLIGHT_PULSAR_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("pulsar_source")); + +pub static IN_FLIGHT_OTHER_SOURCE: LazyLock = + LazyLock::new(|| in_flight_data_gauge("pulsar_source")); + +fn in_flight_data_gauge(component: &'static str) -> Gauge { + let labels = COMPONENT_LABELS.with_values([component]); + gauge!(parent: &*IN_FLIGHT_DATA_BYTES, labels: &labels) } -impl InFlightDataGauges { - #[inline] - pub fn file(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| self.in_flight_gauge_vec.with_label_values(["file_source"])) - } +#[cfg(test)] +mod tests { + use metrics::with_local_recorder; + use metrics_exporter_prometheus::PrometheusBuilder; + use metrics_util::debugging::{DebugValue, DebuggingRecorder}; + use quickwit_metrics::counter; - #[inline] - pub fn ingest(&self) -> &IntGauge { - static GAUGE: 
OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["ingest_source"]) - }) - } + use super::*; - #[inline] - pub fn kafka(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| self.in_flight_gauge_vec.with_label_values(["kafka_source"])) - } + #[test] + fn maybe_registered_counter_counts_locally() { + let counter = MaybeRegisteredCounter::local(); + let counter_clone = counter.clone(); - #[inline] - pub fn kinesis(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["kinesis_source"]) - }) - } + counter.increment(3); + counter_clone.increment(4); - #[inline] - pub fn pubsub(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pubsub_source"]) - }) + assert_eq!(counter.get(), 7); + assert_eq!(counter_clone.get(), 7); } - #[inline] - pub fn pulsar(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pulsar_source"]) - }) + #[test] + fn maybe_registered_counter_wraps_registered_counter() { + let registered_counter = counter!( + name: "maybe_registered_counter_test", + description: "Maybe registered counter test.", + subsystem: "", + observable: true, + ); + let counter = MaybeRegisteredCounter::registered(registered_counter.clone()); + + counter.increment(5); + + assert_eq!(counter.get(), 5); + assert_eq!(registered_counter.get(), 5); } - #[inline] - pub fn other(&self) -> &IntGauge { - static GAUGE: OnceLock = OnceLock::new(); - GAUGE.get_or_init(|| { - self.in_flight_gauge_vec - .with_label_values(["pulsar_source"]) - }) + #[test] + fn metrics_text_payload_renders_prometheus_handle() { + let recorder = PrometheusBuilder::new().build_recorder(); + set_prometheus_handle(recorder.handle()).expect("Prometheus handle should be 
set once"); + + with_local_recorder(&recorder, || { + register_info( + "prometheus_payload_info", + "prometheus payload info", + BTreeMap::new(), + ); + }); + + let payload = metrics_text_payload().expect("Prometheus payload should render"); + assert!(payload.contains("# HELP quickwit_prometheus_payload_info")); + assert!(payload.contains("quickwit_prometheus_payload_info 1")); } -} -/// This function returns `index_id` as is if per-index metrics are enabled, or projects it to -/// `"__any__"` otherwise. -pub fn index_label(index_id: &str) -> &str { - static PER_INDEX_METRICS_ENABLED: LazyLock = - LazyLock::new(|| !crate::get_bool_from_env("QW_DISABLE_PER_INDEX_METRICS", false)); + #[test] + fn register_info_records_labeled_counter() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + with_local_recorder(&recorder, || { + let labels = BTreeMap::from([("version", "test".to_string())]); + register_info("build_info_test", "build info test", labels); + }); + + let snapshot = snapshotter.snapshot().into_vec(); + let (_, _, description, value) = snapshot + .into_iter() + .find(|(composite_key, _, _, _)| { + let (_, key) = composite_key.clone().into_parts(); + key.name() == "quickwit_build_info_test" + && key + .labels() + .any(|label| label.key() == "version" && label.value() == "test") + }) + .expect("build info metric should be recorded"); + assert_eq!(description.as_deref(), Some("build info test")); + assert_eq!(value, DebugValue::Counter(1)); + } - if *PER_INDEX_METRICS_ENABLED { - index_id - } else { - "__any__" + #[test] + fn bucket_helpers_are_reexported() { + assert_eq!(linear_buckets(0.0, 1.0, 3).unwrap(), vec![0.0, 1.0, 2.0]); + assert_eq!( + exponential_buckets(1.0, 2.0, 3).unwrap(), + vec![1.0, 2.0, 4.0] + ); } } - -pub static MEMORY_METRICS: LazyLock = LazyLock::new(MemoryMetrics::default); diff --git a/quickwit/quickwit-common/src/runtimes.rs b/quickwit/quickwit-common/src/runtimes.rs index 79ac2611bd9..21f0ae9015c 
100644 --- a/quickwit/quickwit-common/src/runtimes.rs +++ b/quickwit/quickwit-common/src/runtimes.rs @@ -17,14 +17,47 @@ use std::sync::OnceLock; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; -use prometheus::{Gauge, IntCounter, IntGauge}; +use quickwit_metrics::{Counter, Gauge, Labels, counter, gauge}; use tokio::runtime::Runtime; use tokio_metrics::{RuntimeMetrics, RuntimeMonitor}; -use crate::metrics::{new_counter, new_float_gauge, new_gauge}; - static RUNTIMES: OnceLock> = OnceLock::new(); +static TOKIO_SCHEDULED_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_scheduled_tasks", + description: "The total number of tasks currently scheduled in workers' local queues.", + subsystem: "runtime", + ) +}); + +static TOKIO_WORKER_BUSY_DURATION_MILLISECONDS_TOTAL: std::sync::LazyLock = + std::sync::LazyLock::new(|| { + counter!( + name: "tokio_worker_busy_duration_milliseconds_total", + description: " The total amount of time worker threads were busy.", + subsystem: "runtime", + ) + }); + +static TOKIO_WORKER_BUSY_RATIO: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_worker_busy_ratio", + description: "The ratio of time worker threads were busy since the last time runtime metrics were collected.", + subsystem: "runtime", + ) +}); + +static TOKIO_WORKER_THREADS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "tokio_worker_threads", + description: "The number of worker threads used by the runtime.", + subsystem: "runtime", + ) +}); + +const RUNTIME_TYPE_LABELS: Labels<1> = Labels::new(["runtime_type"]); + /// Describes which runtime an actor should run on. 
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] pub enum RuntimeType { @@ -165,61 +198,44 @@ pub fn scrape_tokio_runtime_metrics(handle: &tokio::runtime::Handle, label: &'st let runtime_monitor = RuntimeMonitor::new(handle); handle.spawn(async move { let mut interval = tokio::time::interval(Duration::from_secs(1)); - let mut prometheus_runtime_metrics = PrometheusRuntimeMetrics::new(label); + let mut runtime_metrics_recorder = RuntimeMetricsRecorder::new(label); for tokio_runtime_metrics in runtime_monitor.intervals() { interval.tick().await; - prometheus_runtime_metrics.update(&tokio_runtime_metrics); + runtime_metrics_recorder.update(&tokio_runtime_metrics); } }); } -struct PrometheusRuntimeMetrics { - scheduled_tasks: IntGauge, - worker_busy_duration_milliseconds_total: IntCounter, +struct RuntimeMetricsRecorder { + scheduled_tasks: Gauge, + worker_busy_duration_milliseconds_total: Counter, worker_busy_ratio: Gauge, - worker_threads: IntGauge, + worker_threads: Gauge, } -impl PrometheusRuntimeMetrics { +impl RuntimeMetricsRecorder { pub fn new(label: &'static str) -> Self { + let labels = RUNTIME_TYPE_LABELS.with_values([label]); Self { - scheduled_tasks: new_gauge( - "tokio_scheduled_tasks", - "The total number of tasks currently scheduled in workers' local queues.", - "runtime", - &[("runtime_type", label)], - ), - worker_busy_duration_milliseconds_total: new_counter( - "tokio_worker_busy_duration_milliseconds_total", - " The total amount of time worker threads were busy.", - "runtime", - &[("runtime_type", label)], - ), - worker_busy_ratio: new_float_gauge( - "tokio_worker_busy_ratio", - "The ratio of time worker threads were busy since the last time runtime metrics \ - were collected.", - "runtime", - &[("runtime_type", label)], - ), - worker_threads: new_gauge( - "tokio_worker_threads", - "The number of worker threads used by the runtime.", - "runtime", - &[("runtime_type", label)], + scheduled_tasks: gauge!(parent: &*TOKIO_SCHEDULED_TASKS, labels: 
&labels), + worker_busy_duration_milliseconds_total: counter!( + parent: &*TOKIO_WORKER_BUSY_DURATION_MILLISECONDS_TOTAL, + labels: &labels, ), + worker_busy_ratio: gauge!(parent: &*TOKIO_WORKER_BUSY_RATIO, labels: &labels), + worker_threads: gauge!(parent: &*TOKIO_WORKER_THREADS, labels: &labels), } } pub fn update(&mut self, runtime_metrics: &RuntimeMetrics) { self.scheduled_tasks - .set(runtime_metrics.total_local_queue_depth as i64); + .set(runtime_metrics.total_local_queue_depth as f64); self.worker_busy_duration_milliseconds_total - .inc_by(runtime_metrics.total_busy_duration.as_millis() as u64); + .increment(runtime_metrics.total_busy_duration.as_millis() as u64); self.worker_busy_ratio.set(runtime_metrics.busy_ratio()); self.worker_threads - .set(runtime_metrics.workers_count as i64); + .set(runtime_metrics.workers_count as f64); } } diff --git a/quickwit/quickwit-common/src/stream_utils.rs b/quickwit/quickwit-common/src/stream_utils.rs index 00b40ee4b43..c1fe28ccec7 100644 --- a/quickwit/quickwit-common/src/stream_utils.rs +++ b/quickwit/quickwit-common/src/stream_utils.rs @@ -18,12 +18,11 @@ use std::pin::Pin; use bytesize::ByteSize; use futures::{Stream, StreamExt, TryStreamExt, stream}; -use prometheus::IntGauge; +use quickwit_metrics::{Gauge, GaugeGuard}; use tokio::sync::{mpsc, watch}; use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream, WatchStream}; use tracing::warn; -use crate::metrics::GaugeGuard; use crate::tower::RpcName; pub type BoxStream = Pin + Send + Unpin + 'static>>; @@ -77,7 +76,7 @@ where T: Send + 'static pub fn new_bounded_with_gauge( capacity: usize, - gauge: &'static IntGauge, + gauge: &'static Gauge, ) -> (TrackedSender, Self) { let (sender, receiver) = mpsc::channel(capacity); let tracked_sender = TrackedSender { sender, gauge }; @@ -94,7 +93,7 @@ where T: Send + 'static (sender, receiver.into()) } - pub fn new_unbounded_with_gauge(gauge: &'static IntGauge) -> (TrackedUnboundedSender, Self) { + pub fn 
new_unbounded_with_gauge(gauge: &'static Gauge) -> (TrackedUnboundedSender, Self) { let (sender, receiver) = mpsc::unbounded_channel(); let tracked_sender = TrackedUnboundedSender { sender, gauge }; let receiver_stream = UnboundedReceiverStream::new(receiver) @@ -228,7 +227,7 @@ where T: RpcName } } -pub struct InFlightValue(T, #[allow(dead_code)] GaugeGuard<'static>); +pub struct InFlightValue(T, #[allow(dead_code)] GaugeGuard); impl fmt::Debug for InFlightValue where T: fmt::Debug @@ -239,10 +238,9 @@ where T: fmt::Debug } impl InFlightValue { - pub fn new(value: T, value_size: ByteSize, gauge: &'static IntGauge) -> Self { - let mut gauge_guard = GaugeGuard::from_gauge(gauge); - gauge_guard.add(value_size.as_u64() as i64); - + pub fn new(value: T, value_size: ByteSize, gauge: &'static Gauge) -> Self { + let gauge_guard = GaugeGuard::from_gauge(gauge); + gauge_guard.increment(value_size.as_u64() as f64); Self(value, gauge_guard) } @@ -253,7 +251,7 @@ impl InFlightValue { pub struct TrackedSender { sender: mpsc::Sender>, - gauge: &'static IntGauge, + gauge: &'static Gauge, } impl TrackedSender { @@ -271,7 +269,7 @@ impl TrackedSender { pub struct TrackedUnboundedSender { sender: mpsc::UnboundedSender>, - gauge: &'static IntGauge, + gauge: &'static Gauge, } impl TrackedUnboundedSender { @@ -286,8 +284,9 @@ impl TrackedUnboundedSender { mod tests { use std::sync::LazyLock; + use quickwit_metrics::{Gauge, gauge}; + use super::*; - use crate::metrics::new_gauge; #[tokio::test] async fn test_service_stream_map() { @@ -300,32 +299,37 @@ mod tests { #[tokio::test] async fn test_tracked_service_stream_bounded() { - static TEST_GAUGE: LazyLock = LazyLock::new(|| { - new_gauge("common", "help", "test_tracked_service_stream_bounded", &[]) + static TEST_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "common", + description: "help", + subsystem: "test_tracked_service_stream_bounded", + observable: true, + ) }); let (service_stream_tx, mut service_stream) = 
ServiceStream::new_bounded_with_gauge(3, &TEST_GAUGE); service_stream_tx.send(1, ByteSize(42)).await.unwrap(); - assert_eq!(TEST_GAUGE.get(), 42); + assert_eq!(TEST_GAUGE.get(), 42.0); service_stream_tx.send(2, ByteSize(1337)).await.unwrap(); - assert_eq!(TEST_GAUGE.get(), 1379); + assert_eq!(TEST_GAUGE.get(), 1379.0); let value = service_stream.next().await.unwrap(); assert_eq!(value, 1); - assert_eq!(TEST_GAUGE.get(), 1337); + assert_eq!(TEST_GAUGE.get(), 1337.0); } #[tokio::test] async fn test_tracked_service_stream_unbounded() { - static TEST_GAUGE: LazyLock = LazyLock::new(|| { - new_gauge( - "common", - "help", - "test_tracked_service_stream_unbounded", - &[], + static TEST_GAUGE: LazyLock = LazyLock::new(|| { + gauge!( + name: "common", + description: "help", + subsystem: "test_tracked_service_stream_unbounded", + observable: true, ) }); @@ -333,13 +337,13 @@ mod tests { ServiceStream::new_unbounded_with_gauge(&TEST_GAUGE); service_stream_tx.send(1, ByteSize(42)).unwrap(); - assert_eq!(TEST_GAUGE.get(), 42); + assert_eq!(TEST_GAUGE.get(), 42.0); service_stream_tx.send(2, ByteSize(1337)).unwrap(); - assert_eq!(TEST_GAUGE.get(), 1379); + assert_eq!(TEST_GAUGE.get(), 1379.0); let value = service_stream.next().await.unwrap(); assert_eq!(value, 1); - assert_eq!(TEST_GAUGE.get(), 1337); + assert_eq!(TEST_GAUGE.get(), 1337.0); } } diff --git a/quickwit/quickwit-common/src/thread_pool.rs b/quickwit/quickwit-common/src/thread_pool.rs index f4b738ef2c0..4d81007c71a 100644 --- a/quickwit/quickwit-common/src/thread_pool.rs +++ b/quickwit/quickwit-common/src/thread_pool.rs @@ -13,14 +13,30 @@ // limitations under the License. 
use std::fmt; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; use futures::{Future, TryFutureExt}; -use prometheus::IntGauge; +use quickwit_metrics::{Gauge, GaugeGuard, Labels, gauge}; use tokio::sync::oneshot; use tracing::error; -use crate::metrics::{GaugeGuard, IntGaugeVec, OwnedGaugeGuard, new_gauge_vec}; +static THREAD_POOL_ONGOING_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "ongoing_tasks", + description: "number of tasks being currently processed by threads in the thread pool", + subsystem: "thread_pool", + ) +}); + +static THREAD_POOL_PENDING_TASKS: std::sync::LazyLock = std::sync::LazyLock::new(|| { + gauge!( + name: "pending_tasks", + description: "number of tasks waiting in the queue before being processed by the thread pool", + subsystem: "thread_pool", + ) +}); + +const THREAD_POOL_LABELS: Labels<1> = Labels::new(["pool"]); /// An executor backed by a thread pool to run CPU-intensive tasks. /// @@ -29,8 +45,8 @@ use crate::metrics::{GaugeGuard, IntGaugeVec, OwnedGaugeGuard, new_gauge_vec}; #[derive(Clone)] pub struct ThreadPool { thread_pool: Arc, - ongoing_tasks: IntGauge, - pending_tasks: IntGauge, + ongoing_tasks: Gauge, + pending_tasks: Gauge, } impl ThreadPool { @@ -46,8 +62,9 @@ impl ThreadPool { let thread_pool = rayon_pool_builder .build() .expect("failed to spawn thread pool"); - let ongoing_tasks = THREAD_POOL_METRICS.ongoing_tasks.with_label_values([name]); - let pending_tasks = THREAD_POOL_METRICS.pending_tasks.with_label_values([name]); + let labels = THREAD_POOL_LABELS.with_values([name]); + let ongoing_tasks = gauge!(parent: &*THREAD_POOL_ONGOING_TASKS, labels: &labels); + let pending_tasks = gauge!(parent: &*THREAD_POOL_PENDING_TASKS, labels: &labels); ThreadPool { thread_pool: Arc::new(thread_pool), ongoing_tasks, @@ -84,9 +101,8 @@ impl ThreadPool { { let span = tracing::Span::current(); let ongoing_tasks = self.ongoing_tasks.clone(); - let mut pending_tasks_guard: OwnedGaugeGuard = - 
OwnedGaugeGuard::from_gauge(self.pending_tasks.clone()); - pending_tasks_guard.add(1i64); + let pending_tasks_guard = GaugeGuard::from_gauge(&self.pending_tasks); + pending_tasks_guard.increment(1.0); let (tx, rx) = oneshot::channel(); self.thread_pool.spawn(move || { drop(pending_tasks_guard); @@ -94,8 +110,8 @@ impl ThreadPool { return; } let _guard = span.enter(); - let mut ongoing_task_guard = GaugeGuard::from_gauge(&ongoing_tasks); - ongoing_task_guard.add(1i64); + let _ongoing_task_guard = GaugeGuard::from_gauge(&ongoing_tasks); + _ongoing_task_guard.increment(1.0); let result = cpu_intensive_fn(); let _ = tx.send(result); }); @@ -134,34 +150,6 @@ impl fmt::Display for Panicked { impl std::error::Error for Panicked {} -struct ThreadPoolMetrics { - ongoing_tasks: IntGaugeVec<1>, - pending_tasks: IntGaugeVec<1>, -} - -impl Default for ThreadPoolMetrics { - fn default() -> Self { - ThreadPoolMetrics { - ongoing_tasks: new_gauge_vec( - "ongoing_tasks", - "number of tasks being currently processed by threads in the thread pool", - "thread_pool", - &[], - ["pool"], - ), - pending_tasks: new_gauge_vec( - "pending_tasks", - "number of tasks waiting in the queue before being processed by the thread pool", - "thread_pool", - &[], - ["pool"], - ), - } - } -} - -static THREAD_POOL_METRICS: LazyLock = LazyLock::new(ThreadPoolMetrics::default); - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/quickwit/quickwit-common/src/tower/circuit_breaker.rs b/quickwit/quickwit-common/src/tower/circuit_breaker.rs index 09ada07e187..c9e54750882 100644 --- a/quickwit/quickwit-common/src/tower/circuit_breaker.rs +++ b/quickwit/quickwit-common/src/tower/circuit_breaker.rs @@ -19,7 +19,7 @@ use std::task::{Context, Poll}; use std::time::Duration; use pin_project::pin_project; -use prometheus::IntCounter; +use quickwit_metrics::Counter; use tokio::time::Instant; use tower::{Layer, Service}; @@ -49,7 +49,7 @@ pub struct CircuitBreakerLayer { time_window: Duration, timeout: 
Duration, evaluator: Evaluator, - circuit_break_total: prometheus::IntCounter, + circuit_break_total: Counter, } pub trait CircuitBreakerEvaluator: Clone { @@ -61,7 +61,7 @@ pub trait CircuitBreakerEvaluator: Clone { self, max_num_errors_per_secs: u32, timeout: Duration, - circuit_break_total: prometheus::IntCounter, + circuit_break_total: Counter, ) -> CircuitBreakerLayer { CircuitBreakerLayer { max_error_count_per_time_window: max_num_errors_per_secs, @@ -102,7 +102,7 @@ struct CircuitBreakerInner { timeout: Duration, evaluator: Evaluator, state: CircuitBreakerState, - circuit_break_total: IntCounter, + circuit_break_total: Counter, } impl CircuitBreakerInner { @@ -125,7 +125,7 @@ impl CircuitBreakerInner { fn receive_error(&mut self) { match self.state { CircuitBreakerState::HalfOpen => { - self.circuit_break_total.inc(); + self.circuit_break_total.increment(1); self.state = CircuitBreakerState::Open { until: Instant::now() + self.timeout, } @@ -144,7 +144,7 @@ impl CircuitBreakerInner { } let now = Instant::now(); if now < error_window_end { - self.circuit_break_total.inc(); + self.circuit_break_total.increment(1); self.state = CircuitBreakerState::Open { until: now + self.timeout, }; @@ -301,8 +301,11 @@ mod tests { const TIMEOUT: Duration = Duration::from_millis(500); - let int_counter: prometheus::IntCounter = - IntCounter::new("circuit_break_total_test", "test circuit breaker counter").unwrap(); + let int_counter = quickwit_metrics::counter!( + name: "circuit_break_total_test", + description: "test circuit breaker counter", + subsystem: "", + ); let mut service = ServiceBuilder::new() .layer(TestCircuitBreakerEvaluator.make_layer(10, TIMEOUT, int_counter)) .service_fn(|_| async { diff --git a/quickwit/quickwit-common/src/tower/metrics.rs b/quickwit/quickwit-common/src/tower/metrics.rs index b2d093adbe3..a66676e23a0 100644 --- a/quickwit/quickwit-common/src/tower/metrics.rs +++ b/quickwit/quickwit-common/src/tower/metrics.rs @@ -13,28 +13,56 @@ // 
limitations under the License. use std::pin::Pin; +use std::sync::LazyLock; use std::task::{Context, Poll}; use std::time::Instant; use futures::{Future, ready}; use pin_project::{pin_project, pinned_drop}; -use prometheus::exponential_buckets; +use quickwit_metrics::{Counter, Gauge, Histogram, Labels, counter, gauge, histogram}; use tower::{Layer, Service}; -use crate::metrics::{ - HistogramVec, IntCounterVec, IntGaugeVec, new_counter_vec, new_gauge_vec, new_histogram_vec, -}; +use crate::metrics::exponential_buckets; pub trait RpcName { fn rpc_name() -> &'static str; } +static GRPC_REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "grpc_requests_total", + description: "Total number of gRPC requests processed.", + subsystem: "", + ) +}); + +static GRPC_REQUESTS_IN_FLIGHT: LazyLock = LazyLock::new(|| { + gauge!( + name: "grpc_requests_in_flight", + description: "Number of gRPC requests in-flight.", + subsystem: "", + ) +}); + +static GRPC_REQUEST_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "grpc_request_duration_seconds", + description: "Duration of request in seconds.", + subsystem: "", + buckets: exponential_buckets(0.001, 2.0, 12).unwrap(), + ) +}); + +const GRPC_SERVICE_LABELS: Labels<2> = Labels::new(["service", "kind"]); +const GRPC_RPC_LABELS: Labels<1> = Labels::new(["rpc"]); +const GRPC_RPC_STATUS_LABELS: Labels<2> = Labels::new(["rpc", "status"]); + #[derive(Clone)] pub struct GrpcMetrics { inner: S, - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + request_duration_seconds: Histogram, } impl Service for GrpcMetrics @@ -55,7 +83,8 @@ where let rpc_name = R::rpc_name(); let inner = self.inner.call(request); - self.requests_in_flight.with_label_values([rpc_name]).inc(); + let labels = GRPC_RPC_LABELS.with_values([rpc_name]); + gauge!(parent: &self.requests_in_flight, labels: 
&labels).increment(1.0); ResponseFuture { inner, @@ -71,35 +100,20 @@ where #[derive(Clone)] pub struct GrpcMetricsLayer { - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + request_duration_seconds: Histogram, } impl GrpcMetricsLayer { pub fn new(subsystem: &'static str, kind: &'static str) -> Self { + let labels = GRPC_SERVICE_LABELS.with_values([subsystem, kind]); Self { - requests_total: new_counter_vec( - "grpc_requests_total", - "Total number of gRPC requests processed.", - subsystem, - &[("kind", kind)], - ["rpc", "status"], - ), - requests_in_flight: new_gauge_vec( - "grpc_requests_in_flight", - "Number of gRPC requests in-flight.", - subsystem, - &[("kind", kind)], - ["rpc"], - ), - request_duration_seconds: new_histogram_vec( - "grpc_request_duration_seconds", - "Duration of request in seconds.", - subsystem, - &[("kind", kind)], - ["rpc", "status"], - exponential_buckets(0.001, 2.0, 12).unwrap(), + requests_total: counter!(parent: &*GRPC_REQUESTS_TOTAL, labels: &labels), + requests_in_flight: gauge!(parent: &*GRPC_REQUESTS_IN_FLIGHT, labels: &labels), + request_duration_seconds: histogram!( + parent: &*GRPC_REQUEST_DURATION_SECONDS, + labels: &labels, ), } } @@ -118,7 +132,7 @@ impl Layer for GrpcMetricsLayer { } } -/// Response future for [`PrometheusMetrics`]. +/// Response future for [`GrpcMetrics`]. 
#[pin_project(PinnedDrop)] pub struct ResponseFuture { #[pin] @@ -126,24 +140,24 @@ pub struct ResponseFuture { start: Instant, rpc_name: &'static str, status: &'static str, - requests_total: IntCounterVec<2>, - requests_in_flight: IntGaugeVec<1>, - request_duration_seconds: HistogramVec<2>, + requests_total: Counter, + requests_in_flight: Gauge, + request_duration_seconds: Histogram, } #[pinned_drop] impl PinnedDrop for ResponseFuture { fn drop(self: Pin<&mut Self>) { let elapsed = self.start.elapsed().as_secs_f64(); - let label_values = [self.rpc_name, self.status]; - - self.requests_total.with_label_values(label_values).inc(); - self.request_duration_seconds - .with_label_values(label_values) - .observe(elapsed); - self.requests_in_flight - .with_label_values([self.rpc_name]) - .dec(); + let rpc_status_labels = GRPC_RPC_STATUS_LABELS.with_values([self.rpc_name, self.status]); + counter!(parent: &self.requests_total, labels: &rpc_status_labels).increment(1); + histogram!( + parent: &self.request_duration_seconds, + labels: &rpc_status_labels, + ) + .record(elapsed); + let rpc_labels = GRPC_RPC_LABELS.with_values([self.rpc_name]); + gauge!(parent: &self.requests_in_flight, labels: &rpc_labels).decrement(1.0); } } @@ -162,6 +176,9 @@ where F: Future> #[cfg(test)] mod tests { + use metrics::with_local_recorder; + use metrics_util::debugging::{DebugValue, DebuggingRecorder}; + use super::*; struct HelloRequest; @@ -180,59 +197,67 @@ mod tests { } } - #[tokio::test] - async fn test_grpc_metrics() { - let layer = GrpcMetricsLayer::new("quickwit_test", "server"); - - let mut hello_service = - layer - .clone() - .layer(tower::service_fn(|request: HelloRequest| async move { - Ok::<_, ()>(request) - })); - let mut goodbye_service = - layer - .clone() - .layer(tower::service_fn(|request: GoodbyeRequest| async move { - Ok::<_, ()>(request) - })); - - hello_service.call(HelloRequest).await.unwrap(); - + #[test] + fn test_grpc_metrics() { + let recorder = 
DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + with_local_recorder(&recorder, || { + futures::executor::block_on(async { + let layer = GrpcMetricsLayer::new("quickwit_test", "server"); + + let mut hello_service = + layer + .clone() + .layer(tower::service_fn(|request: HelloRequest| async move { + Ok::<_, ()>(request) + })); + let mut goodbye_service = + layer + .clone() + .layer(tower::service_fn(|request: GoodbyeRequest| async move { + Ok::<_, ()>(request) + })); + + hello_service.call(HelloRequest).await.unwrap(); + goodbye_service.call(GoodbyeRequest).await.unwrap(); + + let hello_future = hello_service.call(HelloRequest); + drop(hello_future); + }); + }); + + let snapshot = snapshotter.snapshot().into_vec(); + let counter_value = |rpc: &str, status: &str| { + snapshot.iter().find_map(|(composite_key, _, _, value)| { + let (_, key) = composite_key.clone().into_parts(); + let labels = key + .labels() + .map(|label| (label.key(), label.value())) + .collect::>(); + if key.name() == "quickwit_grpc_requests_total" + && labels.contains(&("service", "quickwit_test")) + && labels.contains(&("kind", "server")) + && labels.contains(&("rpc", rpc)) + && labels.contains(&("status", status)) + { + Some(value) + } else { + None + } + }) + }; assert_eq!( - layer - .requests_total - .with_label_values(["hello", "success"]) - .get(), - 1 + counter_value("hello", "success"), + Some(&DebugValue::Counter(1)) ); assert_eq!( - layer - .requests_total - .with_label_values(["goodbye", "success"]) - .get(), - 0 + counter_value("goodbye", "success"), + Some(&DebugValue::Counter(1)) ); - - goodbye_service.call(GoodbyeRequest).await.unwrap(); - - assert_eq!( - layer - .requests_total - .with_label_values(["goodbye", "success"]) - .get(), - 1 - ); - - let hello_future = hello_service.call(HelloRequest); - drop(hello_future); - assert_eq!( - layer - .requests_total - .with_label_values(["hello", "cancelled"]) - .get(), - 1 + counter_value("hello", "cancelled"), + 
Some(&DebugValue::Counter(1)) ); } } diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml index e7d9d012dd0..e0c8e951e98 100644 --- a/quickwit/quickwit-control-plane/Cargo.toml +++ b/quickwit/quickwit-control-plane/Cargo.toml @@ -30,6 +30,7 @@ ulid = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-ingest = { workspace = true } quickwit-metastore = { workspace = true } diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index 1056aba6eb8..4453c4596da 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -219,7 +219,7 @@ impl Actor for ControlPlane { } async fn initialize(&mut self, ctx: &ActorContext) -> Result<(), ActorExitStatus> { - crate::metrics::CONTROL_PLANE_METRICS.restart_total.inc(); + crate::metrics::RESTART_TOTAL.increment(1); self.model .load_from_metastore(&mut self.metastore, ctx.progress()) @@ -568,17 +568,13 @@ fn convert_metastore_error( // It will be up to the client to decide what to do there. error!(err=?metastore_error, transaction_outcome="aborted", "metastore error"); } - crate::metrics::CONTROL_PLANE_METRICS - .metastore_error_aborted - .inc(); + crate::metrics::METASTORE_ERROR_ABORTED.increment(1); Ok(Err(ControlPlaneError::Metastore(metastore_error))) } else { // If the metastore transaction may have been executed, we need to restart the control plane // so that it gets resynced with the metastore state. 
error!(error=?metastore_error, transaction_outcome="maybe-executed", "metastore error"); - crate::metrics::CONTROL_PLANE_METRICS - .metastore_error_maybe_executed - .inc(); + crate::metrics::METASTORE_ERROR_MAYBE_EXECUTED.increment(1); Err(ActorExitStatus::from(anyhow::anyhow!(metastore_error))) } } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 300f6a9d151..9a395269a2a 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -295,7 +295,7 @@ impl IndexingScheduler { // Prefer not calling this method directly, and instead call // `ControlPlane::rebuild_indexing_plan_debounced`. pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel) { - crate::metrics::CONTROL_PLANE_METRICS.schedule_total.inc(); + crate::metrics::SCHEDULE_TOTAL.increment(1); let notify_on_drop = self.next_rebuild_tracker.start_rebuild(); @@ -330,7 +330,7 @@ impl IndexingScheduler { ); let shard_locality_metrics = get_shard_locality_metrics(&new_physical_plan, &shard_locations); - crate::metrics::CONTROL_PLANE_METRICS.set_shard_locality_metrics(shard_locality_metrics); + shard_locality_metrics.publish(); if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { let plans_diff = get_indexing_plans_diff( last_applied_plan.indexing_tasks_per_indexer(), @@ -397,7 +397,7 @@ impl IndexingScheduler { notify_on_drop: Option>, ) { debug!(new_physical_plan=?new_physical_plan, "apply physical indexing plan"); - crate::metrics::CONTROL_PLANE_METRICS.apply_plan_total.inc(); + crate::metrics::APPLY_PLAN_TOTAL.increment(1); for (node_id, indexing_tasks) in new_physical_plan.indexing_tasks_per_indexer() { // We don't want to block on a slow indexer so we apply this change asynchronously // TODO not blocking is cool, but we need to make sure there is not accumulation diff --git 
a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 6a64c183361..5eafeb86296 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -1024,9 +1024,7 @@ impl IngestController { let shards_to_rebalance: Vec = self.compute_shards_to_rebalance(model); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(shards_to_rebalance.len() as i64); + crate::metrics::REBALANCE_SHARDS.set(shards_to_rebalance.len() as f64); if shards_to_rebalance.is_empty() { debug!("skipping rebalance: no shards to rebalance"); @@ -1049,16 +1047,12 @@ impl IngestController { .await .inspect_err(|error| { error!(%error, "failed to open shards during rebalance"); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(0); + crate::metrics::REBALANCE_SHARDS.set(0.0); })?; let num_opened_shards: usize = per_source_num_opened_shards.values().sum(); - crate::metrics::CONTROL_PLANE_METRICS - .rebalance_shards - .set(num_opened_shards as i64); + crate::metrics::REBALANCE_SHARDS.set(num_opened_shards as f64); for source_uid in per_source_num_opened_shards.keys() { // We temporarily disable the ability the scale down the number of shards for diff --git a/quickwit/quickwit-control-plane/src/metrics.rs b/quickwit/quickwit-control-plane/src/metrics.rs index 7935f18a1e8..0d3b4df5b49 100644 --- a/quickwit/quickwit-control-plane/src/metrics.rs +++ b/quickwit/quickwit-control-plane/src/metrics.rs @@ -14,9 +14,7 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - IntCounter, IntGauge, IntGaugeVec, new_counter, new_gauge, new_gauge_vec, -}; +use quickwit_metrics::{Counter, Gauge, Labels, counter, gauge}; #[derive(Debug, Clone, Copy)] pub struct ShardLocalityMetrics { @@ -24,114 +22,95 @@ pub struct ShardLocalityMetrics { pub num_local_shards: usize, } -pub struct ControlPlaneMetrics { - // Indexes and 
shards tracked by the control plane. - pub indexes_total: IntGauge, - pub open_shards: IntGaugeVec<1>, - pub closed_shards: IntGaugeVec<1>, - - // Operations performed by the control plane. - pub apply_plan_total: IntCounter, - pub rebalance_shards: IntGauge, - pub restart_total: IntCounter, - pub schedule_total: IntCounter, - - // Metastore errors. - pub metastore_error_aborted: IntCounter, - pub metastore_error_maybe_executed: IntCounter, - - // Indexing plan metrics. - pub local_shards: IntGauge, - pub remote_shards: IntGauge, -} - -impl ControlPlaneMetrics { - pub fn set_shard_locality_metrics(&self, shard_locality_metrics: ShardLocalityMetrics) { - self.local_shards - .set(shard_locality_metrics.num_local_shards as i64); - self.remote_shards - .set(shard_locality_metrics.num_remote_shards as i64); +impl ShardLocalityMetrics { + pub fn publish(self) { + LOCAL_SHARDS.set(self.num_local_shards as f64); + REMOTE_SHARDS.set(self.num_remote_shards as f64); } } -impl Default for ControlPlaneMetrics { - fn default() -> Self { - let open_shards = new_gauge_vec( - "shards", - "Number of open and closed shards tracked by the ingest controller", - "control_plane", - &[("state", "open")], - ["index_id"], - ); - let closed_shards = new_gauge_vec( - "shards", - "Number of open and closed shards tracked by the ingest controller", - "control_plane", - &[("state", "closed")], - ["index_id"], - ); - let indexed_shards = new_gauge_vec( - "indexed_shards", - "Number of (remote/local) shards in the indexing plan", - "control_plane", - &[], - ["locality"], - ); - let local_shards = indexed_shards.with_label_values(["local"]); - let remote_shards = indexed_shards.with_label_values(["remote"]); - - ControlPlaneMetrics { - indexes_total: new_gauge( - "indexes_total", - "Number of indexes tracked by the control plane.", - "control_plane", - &[], - ), - open_shards, - closed_shards, - apply_plan_total: new_counter( - "apply_plan_total", - "Number of control plane `apply plan` 
operations.", - "control_plane", - &[], - ), - rebalance_shards: new_gauge( - "rebalance_shards", - "Number of shards rebalanced by the control plane.", - "control_plane", - &[], - ), - restart_total: new_counter( - "restart_total", - "Number of control plane restarts.", - "control_plane", - &[], - ), - schedule_total: new_counter( - "schedule_total", - "Number of control plane `schedule` operations.", - "control_plane", - &[], - ), - metastore_error_aborted: new_counter( - "metastore_error_aborted", - "Number of aborted metastore transaction (= do not trigger a control plane \ - restart)", - "control_plane", - &[], - ), - metastore_error_maybe_executed: new_counter( - "metastore_error_maybe_executed", - "Number of metastore transaction with an uncertain outcome (= do trigger a \ - control plane restart)", - "control_plane", - &[], - ), - local_shards, - remote_shards, - } - } -} +pub(crate) static INDEXES_TOTAL: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexes_total", + description: "Number of indexes tracked by the control plane.", + subsystem: "control_plane", + ) +}); + +static SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "shards", + description: "Number of open and closed shards tracked by the ingest controller", + subsystem: "control_plane", + ) +}); + +pub(crate) static OPEN_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: &*SHARDS, "state" => "open")); + +pub(crate) static CLOSED_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: &*SHARDS, "state" => "closed")); + +pub(crate) const INDEX_ID_LABELS: Labels<1> = Labels::new(["index_id"]); + +static INDEXED_SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexed_shards", + description: "Number of (remote/local) shards in the indexing plan", + subsystem: "control_plane", + ) +}); + +pub(crate) static LOCAL_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: &*INDEXED_SHARDS, "locality" => "local")); + +pub(crate) static REMOTE_SHARDS: LazyLock = + LazyLock::new(|| 
gauge!(parent: &*INDEXED_SHARDS, "locality" => "remote")); + +pub(crate) static APPLY_PLAN_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "apply_plan_total", + description: "Number of control plane `apply plan` operations.", + subsystem: "control_plane", + ) +}); + +pub(crate) static REBALANCE_SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "rebalance_shards", + description: "Number of shards rebalanced by the control plane.", + subsystem: "control_plane", + ) +}); + +pub(crate) static RESTART_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "restart_total", + description: "Number of control plane restarts.", + subsystem: "control_plane", + ) +}); + +pub(crate) static SCHEDULE_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "schedule_total", + description: "Number of control plane `schedule` operations.", + subsystem: "control_plane", + ) +}); + +pub(crate) static METASTORE_ERROR_ABORTED: LazyLock = LazyLock::new(|| { + counter!( + name: "metastore_error_aborted", + description: "Number of aborted metastore transaction (= do not trigger a control plane restart)", + subsystem: "control_plane", + ) +}); -pub static CONTROL_PLANE_METRICS: LazyLock = - LazyLock::new(ControlPlaneMetrics::default); +pub(crate) static METASTORE_ERROR_MAYBE_EXECUTED: LazyLock = LazyLock::new(|| { + counter!( + name: "metastore_error_maybe_executed", + description: "Number of metastore transaction with an uncertain outcome (= do trigger a control plane restart)", + subsystem: "control_plane", + ) +}); diff --git a/quickwit/quickwit-control-plane/src/model/mod.rs b/quickwit/quickwit-control-plane/src/model/mod.rs index 0d0431a67ce..8ffa677e315 100644 --- a/quickwit/quickwit-control-plane/src/model/mod.rs +++ b/quickwit/quickwit-control-plane/src/model/mod.rs @@ -167,9 +167,7 @@ impl ControlPlaneModel { } fn update_metrics(&self) { - crate::metrics::CONTROL_PLANE_METRICS - .indexes_total - .set(self.index_table.len() as i64); + 
crate::metrics::INDEXES_TOTAL.set(self.index_table.len() as f64); } pub(crate) fn source_configs(&self) -> impl Iterator + '_ { diff --git a/quickwit/quickwit-control-plane/src/model/shard_table.rs b/quickwit/quickwit-control-plane/src/model/shard_table.rs index 623ae3e6224..aafa344c17d 100644 --- a/quickwit/quickwit-control-plane/src/model/shard_table.rs +++ b/quickwit/quickwit-control-plane/src/model/shard_table.rs @@ -461,14 +461,17 @@ impl ShardTable { // can update the metrics for this specific index. if index_label == index_id { let shard_stats = table_entry.shards_stats(); - crate::metrics::CONTROL_PLANE_METRICS - .open_shards - .with_label_values([index_label]) - .set(shard_stats.num_open_shards as i64); - crate::metrics::CONTROL_PLANE_METRICS - .closed_shards - .with_label_values([index_label]) - .set(shard_stats.num_closed_shards as i64); + let labels = crate::metrics::INDEX_ID_LABELS.with_values([index_label.to_string()]); + quickwit_metrics::gauge!( + parent: &crate::metrics::OPEN_SHARDS, + labels: &labels, + ) + .set(shard_stats.num_open_shards as f64); + quickwit_metrics::gauge!( + parent: &crate::metrics::CLOSED_SHARDS, + labels: &labels, + ) + .set(shard_stats.num_closed_shards as f64); return; } // Per-index metrics are disabled, so we update the metrics for all sources. 
@@ -482,14 +485,17 @@ impl ShardTable { num_closed_shards += 1; } } - crate::metrics::CONTROL_PLANE_METRICS - .open_shards - .with_label_values([index_label]) - .set(num_open_shards as i64); - crate::metrics::CONTROL_PLANE_METRICS - .closed_shards - .with_label_values([index_label]) - .set(num_closed_shards as i64); + let labels = crate::metrics::INDEX_ID_LABELS.with_values([index_label.to_string()]); + quickwit_metrics::gauge!( + parent: &crate::metrics::OPEN_SHARDS, + labels: &labels, + ) + .set(num_open_shards as f64); + quickwit_metrics::gauge!( + parent: &crate::metrics::CLOSED_SHARDS, + labels: &labels, + ) + .set(num_closed_shards as f64); } pub fn update_shards( diff --git a/quickwit/quickwit-directories/src/caching_directory.rs b/quickwit/quickwit-directories/src/caching_directory.rs index 6e9461f5493..4da1981eed2 100644 --- a/quickwit/quickwit-directories/src/caching_directory.rs +++ b/quickwit/quickwit-directories/src/caching_directory.rs @@ -37,9 +37,8 @@ impl CachingDirectory { /// Warning: The resulting CacheDirectory will cache all information without ever /// removing any item from the cache. 
pub fn new_unbounded(underlying: Arc) -> CachingDirectory { - let byte_range_cache = ByteRangeCache::with_infinite_capacity( - &quickwit_storage::STORAGE_METRICS.shortlived_cache, - ); + let byte_range_cache = + ByteRangeCache::with_infinite_capacity(&quickwit_storage::SHORTLIVED_CACHE); CachingDirectory::new(underlying, byte_range_cache) } diff --git a/quickwit/quickwit-index-management/Cargo.toml b/quickwit/quickwit-index-management/Cargo.toml index d303125f65f..35978b9f5e7 100644 --- a/quickwit/quickwit-index-management/Cargo.toml +++ b/quickwit/quickwit-index-management/Cargo.toml @@ -21,6 +21,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-indexing = { workspace = true } quickwit-metastore = { workspace = true } diff --git a/quickwit/quickwit-index-management/src/garbage_collection.rs b/quickwit/quickwit-index-management/src/garbage_collection.rs index dc2655dbfcd..2b964b0a4e4 100644 --- a/quickwit/quickwit-index-management/src/garbage_collection.rs +++ b/quickwit/quickwit-index-management/src/garbage_collection.rs @@ -20,13 +20,13 @@ use std::time::Duration; use anyhow::Context; use futures::{Future, StreamExt}; use itertools::Itertools; -use quickwit_common::metrics::IntCounter; use quickwit_common::pretty::PrettySample; use quickwit_common::{Progress, rate_limited_info}; use quickwit_metastore::{ ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitInfo, SplitMetadata, SplitState, }; +use quickwit_metrics::Counter; use quickwit_proto::metastore::{ DeleteSplitsRequest, ListSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreService, MetastoreServiceClient, @@ -41,9 +41,9 @@ use tracing::{error, instrument}; const DELETE_SPLITS_BATCH_SIZE: usize = 10_000; pub struct GcMetrics { - pub deleted_splits: IntCounter, - pub deleted_bytes: IntCounter, - pub failed_splits: IntCounter, 
+ pub deleted_splits: Counter, + pub deleted_bytes: Counter, + pub failed_splits: Counter, } pub(crate) trait RecordGcMetrics { @@ -53,9 +53,9 @@ pub(crate) trait RecordGcMetrics { impl RecordGcMetrics for Option { fn record(&self, num_deleted_splits: usize, num_deleted_bytes: u64, num_failed_splits: usize) { if let Some(metrics) = self { - metrics.deleted_splits.inc_by(num_deleted_splits as u64); - metrics.deleted_bytes.inc_by(num_deleted_bytes); - metrics.failed_splits.inc_by(num_failed_splits as u64); + metrics.deleted_splits.increment(num_deleted_splits as u64); + metrics.deleted_bytes.increment(num_deleted_bytes); + metrics.failed_splits.increment(num_failed_splits as u64); } } } diff --git a/quickwit/quickwit-indexing/Cargo.toml b/quickwit/quickwit-indexing/Cargo.toml index 2a0d581797d..10a0ef98b75 100644 --- a/quickwit/quickwit-indexing/Cargo.toml +++ b/quickwit/quickwit-indexing/Cargo.toml @@ -55,6 +55,7 @@ quickwit-actors = { workspace = true } quickwit-aws = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-directories = { workspace = true } quickwit-doc-mapper = { workspace = true } diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index 407c55ff526..d0320023544 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -20,11 +20,11 @@ use anyhow::{Context, bail}; use async_trait::async_trait; use bytes::Bytes; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; -use quickwit_common::metrics::IntCounter; use quickwit_common::rate_limited_tracing::rate_limited_warn; use quickwit_common::runtimes::RuntimeType; use quickwit_config::{SourceInputFormat, TransformConfig}; use quickwit_doc_mapper::{DocMapper, DocParsingError, 
JsonObject}; +use quickwit_metrics::{Counter, counter}; use quickwit_opentelemetry::otlp::{ JsonLogIterator, JsonSpanIterator, OtlpLogsError, OtlpTracesError, parse_otlp_logs_json, parse_otlp_logs_protobuf, parse_otlp_spans_json, parse_otlp_spans_protobuf, @@ -45,7 +45,6 @@ use crate::models::{ }; const PLAIN_TEXT: &str = "plain_text"; - pub(super) struct JsonDoc { json_obj: JsonObject, num_bytes: usize, @@ -270,8 +269,8 @@ impl From> for JsonDocIterator { #[derive(Debug)] pub struct DocProcessorCounter { pub num_docs: AtomicU64, - pub num_docs_metric: IntCounter, - pub num_bytes_metric: IntCounter, + pub num_docs_metric: Counter, + pub num_bytes_metric: Counter, } impl Serialize for DocProcessorCounter { @@ -284,15 +283,18 @@ impl Serialize for DocProcessorCounter { impl DocProcessorCounter { fn for_index_and_doc_processor_outcome(index: &str, outcome: &str) -> DocProcessorCounter { let index_label = quickwit_common::metrics::index_label(index); - let labels = [index_label, outcome]; + let labels = crate::metrics::INDEX_DOCS_PROCESSED_STATUS_LABELS + .with_values([index_label.to_string(), outcome.to_string()]); DocProcessorCounter { num_docs: Default::default(), - num_docs_metric: crate::metrics::INDEXER_METRICS - .processed_docs_total - .with_label_values(labels), - num_bytes_metric: crate::metrics::INDEXER_METRICS - .processed_bytes - .with_label_values(labels), + num_docs_metric: counter!( + parent: &crate::metrics::PROCESSED_DOCS_TOTAL, + labels: &labels, + ), + num_bytes_metric: counter!( + parent: &crate::metrics::PROCESSED_BYTES, + labels: &labels, + ), } } @@ -303,8 +305,8 @@ impl DocProcessorCounter { fn record_doc(&self, num_bytes: u64) { self.num_docs.fetch_add(1, Ordering::Relaxed); - self.num_docs_metric.inc(); - self.num_bytes_metric.inc_by(num_bytes); + self.num_docs_metric.increment(1); + self.num_bytes_metric.increment(num_bytes); } } diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs 
index b2257ed203f..43673f86c12 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -27,12 +27,12 @@ use quickwit_actors::{ Actor, ActorContext, ActorExitStatus, Command, Handler, Mailbox, QueueCapacity, }; use quickwit_common::io::IoControls; -use quickwit_common::metrics::GaugeGuard; use quickwit_common::runtimes::RuntimeType; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::IndexingSettings; use quickwit_doc_mapper::DocMapper; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::{IndexingPipelineId, PipelineMetrics}; use quickwit_proto::metastore::{ LastDeleteOpstampRequest, MetastoreService, MetastoreServiceClient, @@ -219,9 +219,8 @@ impl IndexerState { let publish_lock = self.publish_lock.clone(); let publish_token_opt = self.publish_token_opt.clone(); - let mut split_builders_guard = - GaugeGuard::from_gauge(&crate::metrics::INDEXER_METRICS.split_builders); - split_builders_guard.add(1); + let split_builders_guard = GaugeGuard::from_gauge(&crate::metrics::SPLIT_BUILDERS); + split_builders_guard.increment(1.0); let workbench = IndexingWorkbench { workbench_id, @@ -233,11 +232,7 @@ impl IndexerState { publish_lock, publish_token_opt, last_delete_opstamp, - memory_usage: GaugeGuard::from_gauge( - &quickwit_common::metrics::MEMORY_METRICS - .in_flight - .index_writer, - ), + memory_usage: GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INDEX_WRITER), cooperative_indexing_period, split_builders_guard, }; @@ -335,7 +330,7 @@ impl IndexerState { memory_usage_delta += mem_usage_after as i64 - mem_usage_before as i64; ctx.record_progress(); } - memory_usage.add(memory_usage_delta); + memory_usage.increment(memory_usage_delta as f64); Ok(()) } } @@ -358,8 +353,8 @@ struct IndexingWorkbench { // We use this value to set the `delete_opstamp` of the workbench splits. 
last_delete_opstamp: u64, // Number of bytes declared as used by tantivy. - memory_usage: GaugeGuard<'static>, - split_builders_guard: GaugeGuard<'static>, + memory_usage: GaugeGuard, + split_builders_guard: GaugeGuard, cooperative_indexing_period: Option, } @@ -583,7 +578,7 @@ impl Indexer { fn memory_usage(&self) -> ByteSize { if let Some(workbench) = &self.indexing_workbench_opt { - ByteSize(workbench.memory_usage.get() as u64) + ByteSize(workbench.memory_usage.delta() as u64) } else { ByteSize(0u64) } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 3b43e47c105..107869932e8 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -23,12 +23,12 @@ use quickwit_actors::{ QueueCapacity, Supervisable, }; use quickwit_common::KillSwitch; -use quickwit_common::metrics::OwnedGaugeGuard; use quickwit_common::pubsub::EventBroker; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::{IndexingSettings, RetentionPolicy, SourceConfig}; use quickwit_doc_mapper::DocMapper; use quickwit_ingest::IngesterPool; +use quickwit_metrics::{GaugeGuard, counter, gauge}; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::metastore::{MetastoreError, MetastoreServiceClient}; use quickwit_proto::types::ShardId; @@ -88,7 +88,7 @@ pub struct IndexingPipeline { // requiring a respawn of the pipeline. // We keep the list of shards here however, to reassign them after a respawn. 
shard_ids: BTreeSet, - _indexing_pipelines_gauge_guard: OwnedGaugeGuard, + _indexing_pipelines_gauge_guard: GaugeGuard, } #[async_trait] @@ -123,10 +123,17 @@ impl Actor for IndexingPipeline { impl IndexingPipeline { pub fn new(params: IndexingPipelineParams) -> Self { - let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS - .indexing_pipelines - .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let labels = crate::metrics::INDEX_LABELS.with_values([params + .pipeline_id + .index_uid + .index_id + .clone()]); + let indexing_pipelines_gauge = gauge!( + parent: &crate::metrics::INDEXING_PIPELINES, + labels: &labels, + ); + let indexing_pipelines_gauge_guard = GaugeGuard::from_gauge(&indexing_pipelines_gauge); + indexing_pipelines_gauge_guard.increment(1.0); let params_fingerprint = params.params_fingerprint; IndexingPipeline { params, @@ -311,21 +318,19 @@ impl IndexingPipeline { let (publisher_mailbox, publisher_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["publisher"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "publisher", + )) .spawn(publisher); let sequencer = Sequencer::new(publisher_mailbox); let (sequencer_mailbox, sequencer_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["sequencer"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "sequencer", + )) .set_kill_switch(self.kill_switch.clone()) .spawn(sequencer); @@ -342,11 +347,10 @@ impl IndexingPipeline { ); let (uploader_mailbox, uploader_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - 
crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["uploader"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "uploader", + )) .set_kill_switch(self.kill_switch.clone()) .spawn(uploader); @@ -377,11 +381,10 @@ impl IndexingPipeline { ); let (indexer_mailbox, indexer_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["indexer"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "indexer", + )) .set_kill_switch(self.kill_switch.clone()) .spawn(indexer); @@ -395,11 +398,10 @@ impl IndexingPipeline { )?; let (doc_processor_mailbox, doc_processor_handle) = ctx .spawn_actor() - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["doc_processor"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "doc_processor", + )) .set_kill_switch(self.kill_switch.clone()) .spawn(doc_processor); let source_runtime = SourceRuntime { diff --git a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs index b901d9f804a..587b567ab30 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs @@ -30,6 +30,7 @@ use quickwit_metastore::{ ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitMetadata, SplitState, }; +use quickwit_metrics::counter; use quickwit_proto::indexing::MergePipelineId; use quickwit_proto::metastore::{ ListSplitsRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceClient, @@ -272,11 +273,10 @@ impl MergePipeline { let (merge_publisher_mailbox, merge_publisher_handle) = ctx .spawn_actor() 
.set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["merge_publisher"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "merge_publisher", + )) .spawn(merge_publisher); // Merge uploader @@ -322,11 +322,10 @@ impl MergePipeline { let (merge_executor_mailbox, merge_executor_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["merge_executor"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "merge_executor", + )) .spawn(merge_executor); let merge_split_downloader = MergeSplitDownloader { @@ -338,11 +337,10 @@ impl MergePipeline { let (merge_split_downloader_mailbox, merge_split_downloader_handle) = ctx .spawn_actor() .set_kill_switch(self.kill_switch.clone()) - .set_backpressure_micros_counter( - crate::metrics::INDEXER_METRICS - .backpressure_micros - .with_label_values(["merge_split_downloader"]), - ) + .set_backpressure_micros_counter(counter!( + parent: &crate::metrics::BACKPRESSURE_MICROS, + "actor_name" => "merge_split_downloader", + )) .spawn(merge_split_downloader); // Merge planner @@ -397,9 +395,7 @@ impl MergePipeline { handles.merge_planner.refresh_observe(); handles.merge_uploader.refresh_observe(); handles.merge_publisher.refresh_observe(); - let num_ongoing_merges = crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .get(); + let num_ongoing_merges = crate::metrics::ONGOING_MERGE_OPERATIONS.get(); self.statistics = self .previous_generations_statistics .clone() @@ -409,7 +405,7 @@ impl MergePipeline { ) .set_generation(self.statistics.generation) .set_num_spawn_attempts(self.statistics.num_spawn_attempts) - 
.set_ongoing_merges(usize::try_from(num_ongoing_merges).unwrap_or(0)); + .set_ongoing_merges(num_ongoing_merges.max(0.0) as usize); } async fn perform_health_check( diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index 70fe17c621b..fc8bfff5567 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -226,12 +226,8 @@ impl MergeSchedulerService { _merge_permit: merge_permit, }; self.pending_merge_bytes -= merge_task.merge_operation.total_num_bytes(); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set(self.pending_merge_queue.len() as i64); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + crate::metrics::PENDING_MERGE_OPERATIONS.set(self.pending_merge_queue.len() as f64); + crate::metrics::PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); match split_downloader_mailbox.try_send_message(merge_task) { Ok(_) => {} Err(quickwit_actors::TrySendError::Full(_)) => { @@ -273,15 +269,10 @@ impl MergeSchedulerService { merge_permit, }; self.pending_merge_bytes -= parquet_merge_task.merge_operation.total_size_bytes(); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set( - self.pending_merge_queue.len() as i64 - + self.pending_parquet_merge_queue.len() as i64, - ); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + crate::metrics::PENDING_MERGE_OPERATIONS.set( + (self.pending_merge_queue.len() + self.pending_parquet_merge_queue.len()) as f64, + ); + crate::metrics::PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); match split_downloader_mailbox.try_send_message(parquet_merge_task) { Ok(_) => {} Err(quickwit_actors::TrySendError::Full(_)) => { @@ -295,9 +286,7 @@ impl MergeSchedulerService { let num_merges = self.merge_concurrency as i64 - 
self.merge_semaphore.available_permits() as i64; - crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .set(num_merges); + crate::metrics::ONGOING_MERGE_OPERATIONS.set(num_merges as f64); } } @@ -381,12 +370,8 @@ impl Handler for MergeSchedulerService { }; self.pending_merge_bytes += scheduled_merge.merge_operation.total_num_bytes(); self.pending_merge_queue.push(scheduled_merge); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set(self.pending_merge_queue.len() as i64); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + crate::metrics::PENDING_MERGE_OPERATIONS.set(self.pending_merge_queue.len() as f64); + crate::metrics::PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); self.schedule_pending_merges(ctx); Ok(()) } @@ -467,15 +452,9 @@ impl Handler for MergeSchedulerService { }; self.pending_merge_bytes += scheduled.merge_operation.total_size_bytes(); self.pending_parquet_merge_queue.push(scheduled); - crate::metrics::INDEXER_METRICS - .pending_merge_operations - .set( - self.pending_merge_queue.len() as i64 - + self.pending_parquet_merge_queue.len() as i64, - ); - crate::metrics::INDEXER_METRICS - .pending_merge_bytes - .set(self.pending_merge_bytes as i64); + crate::metrics::PENDING_MERGE_OPERATIONS + .set((self.pending_merge_queue.len() + self.pending_parquet_merge_queue.len()) as f64); + crate::metrics::PENDING_MERGE_BYTES.set(self.pending_merge_bytes as f64); self.schedule_pending_merges(ctx); Ok(()) } diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs index d1347f2d4bf..7ce7b312d17 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_merge_pipeline.rs @@ -367,9 +367,7 @@ impl ParquetMergePipeline { handles.merge_planner.refresh_observe(); 
handles.merge_uploader.refresh_observe(); handles.merge_publisher.refresh_observe(); - let num_ongoing_merges = crate::metrics::INDEXER_METRICS - .ongoing_merge_operations - .get(); + let num_ongoing_merges = crate::metrics::ONGOING_MERGE_OPERATIONS.get(); self.statistics = self .previous_generations_statistics .clone() @@ -379,7 +377,7 @@ impl ParquetMergePipeline { ) .set_generation(self.statistics.generation) .set_num_spawn_attempts(self.statistics.num_spawn_attempts) - .set_ongoing_merges(usize::try_from(num_ongoing_merges).unwrap_or(0)); + .set_ongoing_merges(num_ongoing_merges.max(0.0) as usize); } async fn perform_health_check( diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs index 3702f727f93..5e77d9a2a05 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/parquet_uploader.rs @@ -27,6 +27,7 @@ use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_common::spawn_named_task; use quickwit_metastore::StageParquetSplitsRequestExt; +use quickwit_metrics::gauge; use quickwit_parquet_engine::split::{ParquetSplitKind, ParquetSplitMetadata}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient}; use quickwit_storage::Storage; @@ -36,7 +37,6 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use super::{ParquetSplitBatch, ParquetSplitsUpdate}; use crate::actors::sequencer::{Sequencer, SequencerCommand}; use crate::actors::{Publisher, UploaderCounters, UploaderType}; -use crate::metrics::INDEXER_METRICS; /// Concurrent upload permits for metrics uploader. /// Uses same permit pool as indexer uploads. 
@@ -122,10 +122,11 @@ impl ParquetUploader { let _guard = ctx.protect_zone(); let concurrent_upload_permits = CONCURRENT_UPLOAD_PERMITS_METRICS .get_or_init(|| Semaphore::const_new(self.max_concurrent_uploads)); - let gauge = INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["metrics"]); - gauge.set(concurrent_upload_permits.available_permits() as i64); + let gauge = gauge!( + parent: &crate::metrics::AVAILABLE_CONCURRENT_UPLOAD_PERMITS, + "component" => "metrics", + ); + gauge.set(concurrent_upload_permits.available_permits() as f64); concurrent_upload_permits .acquire() .await diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs index bf596995b89..98b366861f6 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/pipeline.rs @@ -33,11 +33,11 @@ use quickwit_actors::{ QueueCapacity, Supervisable, }; use quickwit_common::KillSwitch; -use quickwit_common::metrics::OwnedGaugeGuard; use quickwit_common::pubsub::EventBroker; use quickwit_common::temp_dir::TempDirectory; use quickwit_config::{IndexingSettings, SourceConfig}; use quickwit_ingest::IngesterPool; +use quickwit_metrics::{GaugeGuard, gauge}; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::metastore::{MetastoreError, MetastoreServiceClient}; use quickwit_proto::types::ShardId; @@ -111,7 +111,7 @@ pub struct MetricsPipeline { handles_opt: Option, kill_switch: KillSwitch, shard_ids: BTreeSet, - _indexing_pipelines_gauge_guard: OwnedGaugeGuard, + _indexing_pipelines_gauge_guard: GaugeGuard, } #[async_trait] @@ -144,10 +144,17 @@ impl Actor for MetricsPipeline { impl MetricsPipeline { pub fn new(params: MetricsPipelineParams) -> Self { - let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS - .indexing_pipelines - .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let 
indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let labels = crate::metrics::INDEX_LABELS.with_values([params + .pipeline_id + .index_uid + .index_id + .clone()]); + let indexing_pipelines_gauge = gauge!( + parent: &crate::metrics::INDEXING_PIPELINES, + labels: &labels, + ); + let indexing_pipelines_gauge_guard = GaugeGuard::from_gauge(&indexing_pipelines_gauge); + indexing_pipelines_gauge_guard.increment(1.0); let params_fingerprint = params.params_fingerprint; MetricsPipeline { params, diff --git a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs index 0599e03bf2c..7b6831a5b6f 100644 --- a/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs +++ b/quickwit/quickwit-indexing/src/actors/metrics_pipeline/processed_parquet_batch.rs @@ -20,8 +20,8 @@ use std::fmt; use arrow::record_batch::RecordBatch; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; /// Batch of parquet data as Arrow RecordBatch for the parquet indexing pipeline. /// @@ -35,7 +35,7 @@ pub struct ProcessedParquetBatch { /// Force commit flag - when true, accumulator should flush immediately. pub force_commit: bool, /// Memory tracking gauge guard. 
- _gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl ProcessedParquetBatch { @@ -65,8 +65,9 @@ impl ProcessedParquetBatch { .map(|col| col.get_array_memory_size() as i64) .sum(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.indexer_mailbox); - gauge_guard.add(memory_size); + let gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INDEXER_MAILBOX); + gauge_guard.increment(memory_size as f64); Self { batches, diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 8e1c0d56afb..e9c80e1b051 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -28,6 +28,7 @@ use quickwit_common::spawn_named_task; use quickwit_config::RetentionPolicy; use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{SplitMetadata, StageSplitsRequestExt}; +use quickwit_metrics::gauge; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, StageSplitsRequest}; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; use quickwit_proto::types::{IndexUid, PublishToken}; @@ -40,7 +41,6 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use crate::actors::Publisher; use crate::actors::sequencer::{Sequencer, SequencerCommand}; use crate::merge_policy::{MergePolicy, MergeTask}; -use crate::metrics::INDEXER_METRICS; use crate::models::{ EmptySplit, PackagedSplit, PackagedSplitBatch, PublishLock, SplitsUpdate, create_split_metadata, }; @@ -203,26 +203,29 @@ impl Uploader { match self.uploader_type { UploaderType::IndexUploader => ( &CONCURRENT_UPLOAD_PERMITS_INDEX, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["indexer"]), + gauge!( + parent: &crate::metrics::AVAILABLE_CONCURRENT_UPLOAD_PERMITS, + "component" => "indexer", + ), ), UploaderType::MergeUploader => ( &CONCURRENT_UPLOAD_PERMITS_MERGE, - 
INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["merger"]), + gauge!( + parent: &crate::metrics::AVAILABLE_CONCURRENT_UPLOAD_PERMITS, + "component" => "merger", + ), ), UploaderType::DeleteUploader => ( &CONCURRENT_UPLOAD_PERMITS_MERGE, - INDEXER_METRICS - .available_concurrent_upload_permits - .with_label_values(["merger"]), + gauge!( + parent: &crate::metrics::AVAILABLE_CONCURRENT_UPLOAD_PERMITS, + "component" => "merger", + ), ), }; let concurrent_upload_permits = concurrent_upload_permits_once_cell .get_or_init(|| Semaphore::const_new(self.max_concurrent_split_uploads)); - concurrent_upload_permits_gauge.set(concurrent_upload_permits.available_permits() as i64); + concurrent_upload_permits_gauge.set(concurrent_upload_permits.available_permits() as f64); concurrent_upload_permits .acquire() .await diff --git a/quickwit/quickwit-indexing/src/metrics.rs b/quickwit/quickwit-indexing/src/metrics.rs index 98ca19636a2..699c2568e19 100644 --- a/quickwit/quickwit-indexing/src/metrics.rs +++ b/quickwit/quickwit-indexing/src/metrics.rs @@ -14,103 +14,92 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, new_counter, new_counter_vec, new_gauge, - new_gauge_vec, -}; +use quickwit_metrics::{Counter, Gauge, Labels, counter, gauge}; -pub struct IndexerMetrics { - pub processed_docs_total: IntCounterVec<2>, - pub processed_bytes: IntCounterVec<2>, - pub indexing_pipelines: IntGaugeVec<1>, - pub backpressure_micros: IntCounterVec<1>, - pub available_concurrent_upload_permits: IntGaugeVec<1>, - pub split_builders: IntGauge, - pub ongoing_merge_operations: IntGauge, - pub pending_merge_operations: IntGauge, - pub pending_merge_bytes: IntGauge, - // We use a lazy counter, as most users do not use Kafka. 
- #[cfg_attr(not(feature = "kafka"), allow(dead_code))] - pub kafka_rebalance_total: LazyLock, -} +pub(crate) const INDEX_DOCS_PROCESSED_STATUS_LABELS: Labels<2> = + Labels::new(["index", "docs_processed_status"]); -impl Default for IndexerMetrics { - fn default() -> Self { - IndexerMetrics { - processed_docs_total: new_counter_vec( - "processed_docs_total", - "Number of processed docs by index, source and processed status in [valid, \ - schema_error, parse_error, transform_error]", - "indexing", - &[], - ["index", "docs_processed_status"], - ), - processed_bytes: new_counter_vec( - "processed_bytes", - "Number of bytes of processed documents by index, source and processed status in \ - [valid, schema_error, parse_error, transform_error]", - "indexing", - &[], - ["index", "docs_processed_status"], - ), - indexing_pipelines: new_gauge_vec( - "indexing_pipelines", - "Number of running indexing pipelines", - "indexing", - &[], - ["index"], - ), - backpressure_micros: new_counter_vec( - "backpressure_micros", - "Amount of time spent in backpressure (in micros). 
This time only includes the \ - amount of time spent waiting for a place in the queue of another actor.", - "indexing", - &[], - ["actor_name"], - ), - available_concurrent_upload_permits: new_gauge_vec( - "concurrent_upload_available_permits_num", - "Number of available concurrent upload permits by component in [merger, indexer]", - "indexing", - &[], - ["component"], - ), - split_builders: new_gauge( - "split_builders", - "Number of existing index writer instances.", - "indexing", - &[], - ), - ongoing_merge_operations: new_gauge( - "ongoing_merge_operations", - "Number of ongoing merge operations", - "indexing", - &[], - ), - pending_merge_operations: new_gauge( - "pending_merge_operations", - "Number of pending merge operations", - "indexing", - &[], - ), - pending_merge_bytes: new_gauge( - "pending_merge_bytes", - "Number of pending merge bytes", - "indexing", - &[], - ), - kafka_rebalance_total: LazyLock::new(|| { - new_counter( - "kafka_rebalance_total", - "Number of kafka rebalances", - "indexing", - &[], - ) - }), - } - } -} +pub(crate) static PROCESSED_DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "processed_docs_total", + description: "Number of processed docs by index, source and processed status in [valid, schema_error, parse_error, transform_error]", + subsystem: "indexing", + ) +}); -/// `INDEXER_METRICS` exposes indexing related metrics through a prometheus -/// endpoint. 
-pub static INDEXER_METRICS: LazyLock = LazyLock::new(IndexerMetrics::default); +pub(crate) static PROCESSED_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "processed_bytes", + description: "Number of bytes of processed documents by index, source and processed status in [valid, schema_error, parse_error, transform_error]", + subsystem: "indexing", + ) +}); + +pub(crate) static INDEXING_PIPELINES: LazyLock = LazyLock::new(|| { + gauge!( + name: "indexing_pipelines", + description: "Number of running indexing pipelines", + subsystem: "indexing", + ) +}); + +pub(crate) const INDEX_LABELS: Labels<1> = Labels::new(["index"]); + +pub(crate) static BACKPRESSURE_MICROS: LazyLock = LazyLock::new(|| { + counter!( + name: "backpressure_micros", + description: "Amount of time spent in backpressure (in micros). This time only includes the amount of time spent waiting for a place in the queue of another actor.", + subsystem: "indexing", + ) +}); + +pub(crate) static AVAILABLE_CONCURRENT_UPLOAD_PERMITS: LazyLock = LazyLock::new(|| { + gauge!( + name: "concurrent_upload_available_permits_num", + description: "Number of available concurrent upload permits by component in [merger, indexer]", + subsystem: "indexing", + ) +}); + +pub(crate) static SPLIT_BUILDERS: LazyLock = LazyLock::new(|| { + gauge!( + name: "split_builders", + description: "Number of existing index writer instances.", + subsystem: "indexing", + ) +}); + +pub(crate) static ONGOING_MERGE_OPERATIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "ongoing_merge_operations", + description: "Number of ongoing merge operations", + subsystem: "indexing", + observable: true, + ) +}); + +pub(crate) static PENDING_MERGE_OPERATIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "pending_merge_operations", + description: "Number of pending merge operations", + subsystem: "indexing", + ) +}); + +pub(crate) static PENDING_MERGE_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "pending_merge_bytes", + 
description: "Number of pending merge bytes", + subsystem: "indexing", + ) +}); + +// We use a lazy counter, as most users do not use Kafka. +#[cfg_attr(not(feature = "kafka"), allow(dead_code))] +pub(crate) static KAFKA_REBALANCE_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "kafka_rebalance_total", + description: "Number of kafka rebalances", + subsystem: "indexing", + ) +}); diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index cd272bdc34c..03728fe2f6a 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -16,9 +16,9 @@ use std::fmt; use std::path::Path; use quickwit_common::io::IoControls; -use quickwit_common::metrics::GaugeGuard; use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::IndexCheckpointDelta; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::types::{DocMappingUid, IndexUid, PublishToken}; use tantivy::IndexBuilder; @@ -182,8 +182,8 @@ pub struct IndexedSplitBatchBuilder { pub publish_token_opt: Option, pub commit_trigger: CommitTrigger, pub batch_parent_span: Span, - pub memory_usage: GaugeGuard<'static>, - pub _split_builders_guard: GaugeGuard<'static>, + pub memory_usage: GaugeGuard, + pub _split_builders_guard: GaugeGuard, } /// Sends notifications to the Publisher that the last batch of splits was empty. 
diff --git a/quickwit/quickwit-indexing/src/models/processed_doc.rs b/quickwit/quickwit-indexing/src/models/processed_doc.rs index bed695aa1d4..424956e9d17 100644 --- a/quickwit/quickwit-indexing/src/models/processed_doc.rs +++ b/quickwit/quickwit-indexing/src/models/processed_doc.rs @@ -14,8 +14,8 @@ use std::fmt; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; use tantivy::{DateTime, TantivyDocument}; pub struct ProcessedDoc { @@ -41,7 +41,7 @@ pub struct ProcessedDocBatch { pub docs: Vec, pub checkpoint_delta: SourceCheckpointDelta, pub force_commit: bool, - _gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl ProcessedDocBatch { @@ -51,8 +51,9 @@ impl ProcessedDocBatch { force_commit: bool, ) -> Self { let delta = docs.iter().map(|doc| doc.num_bytes as i64).sum::(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.indexer_mailbox); - gauge_guard.add(delta); + let gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INDEXER_MAILBOX); + gauge_guard.increment(delta as f64); Self { docs, checkpoint_delta, diff --git a/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs b/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs index f88d9fcac2b..d5178afd123 100644 --- a/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs +++ b/quickwit/quickwit-indexing/src/models/raw_doc_batch.rs @@ -15,8 +15,8 @@ use std::fmt; use bytes::Bytes; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_metastore::checkpoint::SourceCheckpointDelta; +use quickwit_metrics::GaugeGuard; pub struct RawDocBatch { // Do not directly append documents to this vector; otherwise, in-flight metrics will be @@ -24,7 +24,7 @@ pub struct RawDocBatch { pub docs: Vec, pub checkpoint_delta: SourceCheckpointDelta, pub force_commit: bool, - _gauge_guard: GaugeGuard<'static>, + _gauge_guard: GaugeGuard, } impl 
RawDocBatch { @@ -34,9 +34,9 @@ impl RawDocBatch { force_commit: bool, ) -> Self { let delta = docs.iter().map(|doc| doc.len() as i64).sum::(); - let mut gauge_guard = - GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.doc_processor_mailbox); - gauge_guard.add(delta); + let gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_DOC_PROCESSOR_MAILBOX); + gauge_guard.increment(delta as f64); Self { docs, @@ -67,7 +67,8 @@ impl fmt::Debug for RawDocBatch { impl Default for RawDocBatch { fn default() -> Self { - let _gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.doc_processor_mailbox); + let _gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_DOC_PROCESSOR_MAILBOX); Self { docs: Vec::new(), checkpoint_delta: SourceCheckpointDelta::default(), diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index 4403fa0f547..cc7c0f1772c 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -665,7 +665,6 @@ mod tests { use itertools::Itertools; use quickwit_actors::{ActorContext, Universe}; use quickwit_common::ServiceStream; - use quickwit_common::metrics::MEMORY_METRICS; use quickwit_common::stream_utils::InFlightValue; use quickwit_config::{IndexingSettings, SourceConfig, SourceParams}; use quickwit_ingest::IngesterPoolEntry; @@ -1437,7 +1436,7 @@ mod tests { let in_flight_value = InFlightValue::new( fetch_message, batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM, ); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); @@ -1454,7 +1453,7 @@ mod tests { let in_flight_value = InFlightValue::new( fetch_message, batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM, ); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); @@ -1468,7 +1467,7 @@ mod tests { let 
in_flight_value = InFlightValue::new( fetch_message, ByteSize(0), - &MEMORY_METRICS.in_flight.fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM, ); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); @@ -1529,7 +1528,7 @@ mod tests { let in_flight_value = InFlightValue::new( fetch_message, batch_size, - &MEMORY_METRICS.in_flight.fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM, ); fetch_message_tx.send(Ok(in_flight_value)).await.unwrap(); diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs b/quickwit/quickwit-indexing/src/source/kafka_source.rs index 93ce5b3dc37..9b654a6b02f 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -127,7 +127,7 @@ macro_rules! return_if_err { /// impl ConsumerContext for RdKafkaContext { fn pre_rebalance(&self, _consumer: &BaseConsumer, rebalance: &Rebalance) { - crate::metrics::INDEXER_METRICS.kafka_rebalance_total.inc(); + crate::metrics::KAFKA_REBALANCE_TOTAL.increment(1); quickwit_common::rate_limited_info!(limit_per_min = 3, topic = self.topic, "rebalance"); if let Rebalance::Revoke(tpl) = rebalance { let partitions = collect_partitions(tpl, &self.topic); diff --git a/quickwit/quickwit-indexing/src/source/mod.rs b/quickwit/quickwit-indexing/src/source/mod.rs index 0e696eaea0f..a5f946ec7a5 100644 --- a/quickwit/quickwit-indexing/src/source/mod.rs +++ b/quickwit/quickwit-indexing/src/source/mod.rs @@ -92,7 +92,6 @@ pub use pulsar_source::{PulsarSource, PulsarSourceFactory}; #[cfg(feature = "sqs")] pub use queue_sources::sqs_queue; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler}; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_common::pubsub::EventBroker; use quickwit_common::runtimes::RuntimeType; use quickwit_config::{ @@ -101,6 +100,7 @@ use quickwit_config::{ use quickwit_ingest::IngesterPool; use quickwit_metastore::IndexMetadataResponseExt; use 
quickwit_metastore::checkpoint::{SourceCheckpoint, SourceCheckpointDelta}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::metastore::{ IndexMetadataRequest, MetastoreError, MetastoreResult, MetastoreService, @@ -519,7 +519,7 @@ pub(super) struct BatchBuilder { num_bytes: u64, checkpoint_delta: SourceCheckpointDelta, force_commit: bool, - gauge_guard: GaugeGuard<'static>, + gauge_guard: GaugeGuard, } impl BatchBuilder { @@ -529,13 +529,13 @@ impl BatchBuilder { pub fn with_capacity(capacity: usize, source_type: SourceType) -> Self { let gauge = match source_type { - SourceType::File => MEMORY_METRICS.in_flight.file(), - SourceType::IngestV2 => MEMORY_METRICS.in_flight.ingest(), - SourceType::Kafka => MEMORY_METRICS.in_flight.kafka(), - SourceType::Kinesis => MEMORY_METRICS.in_flight.kinesis(), - SourceType::PubSub => MEMORY_METRICS.in_flight.pubsub(), - SourceType::Pulsar => MEMORY_METRICS.in_flight.pulsar(), - _ => MEMORY_METRICS.in_flight.other(), + SourceType::File => &quickwit_common::metrics::IN_FLIGHT_FILE_SOURCE, + SourceType::IngestV2 => &quickwit_common::metrics::IN_FLIGHT_INGEST_SOURCE, + SourceType::Kafka => &quickwit_common::metrics::IN_FLIGHT_KAFKA_SOURCE, + SourceType::Kinesis => &quickwit_common::metrics::IN_FLIGHT_KINESIS_SOURCE, + SourceType::PubSub => &quickwit_common::metrics::IN_FLIGHT_PUBSUB_SOURCE, + SourceType::Pulsar => &quickwit_common::metrics::IN_FLIGHT_PULSAR_SOURCE, + _ => &quickwit_common::metrics::IN_FLIGHT_OTHER_SOURCE, }; let gauge_guard = GaugeGuard::from_gauge(gauge); @@ -551,8 +551,8 @@ impl BatchBuilder { pub fn add_doc(&mut self, doc: Bytes) { let num_bytes = doc.len(); self.docs.push(doc); - self.gauge_guard.add(num_bytes as i64); self.num_bytes += num_bytes as u64; + self.gauge_guard.increment(num_bytes as f64); } pub fn force_commit(&mut self) { @@ -567,7 +567,7 @@ impl BatchBuilder { pub fn clear(&mut self) { self.docs.clear(); self.checkpoint_delta = 
SourceCheckpointDelta::default(); - self.gauge_guard.sub(self.num_bytes as i64); + self.gauge_guard.increment(-(self.num_bytes as f64)); self.num_bytes = 0; } } diff --git a/quickwit/quickwit-ingest/Cargo.toml b/quickwit/quickwit-ingest/Cargo.toml index 1bf15d76fd4..03121cf4cc6 100644 --- a/quickwit/quickwit-ingest/Cargo.toml +++ b/quickwit/quickwit-ingest/Cargo.toml @@ -38,6 +38,7 @@ utoipa = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-ingest/src/ingest_api_service.rs b/quickwit/quickwit-ingest/src/ingest_api_service.rs index 7ee8d0b232b..8dc08487459 100644 --- a/quickwit/quickwit-ingest/src/ingest_api_service.rs +++ b/quickwit/quickwit-ingest/src/ingest_api_service.rs @@ -22,11 +22,11 @@ use quickwit_actors::{ }; use quickwit_common::runtimes::RuntimeType; use quickwit_common::tower::Cost; +use quickwit_metrics::counter; use quickwit_proto::ingest::RateLimitingCause; use tracing::{error, info}; use ulid::Ulid; -use crate::metrics::INGEST_METRICS; use crate::notifications::Notifications; use crate::{ CommitType, CreateQueueIfNotExistsRequest, CreateQueueIfNotExistsResponse, CreateQueueRequest, @@ -201,12 +201,13 @@ impl IngestApiService { } num_docs += batch_num_docs; - INGEST_METRICS - .ingested_docs_bytes_valid - .inc_by(batch_num_bytes as u64); - INGEST_METRICS - .ingested_docs_valid - .inc_by(batch_num_docs as u64); + counter!( + parent: &crate::metrics::DOCS_BYTES_TOTAL, + "validity" => "valid", + ) + .increment(batch_num_bytes as u64); + counter!(parent: &crate::metrics::DOCS_TOTAL, "validity" => "valid") + .increment(batch_num_docs as u64); } // TODO we could fsync here and disable autosync to have better i/o perfs. 
Ok(( diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs index 6531c893c9f..6306399121a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs @@ -30,7 +30,6 @@ use tracing::{debug, warn}; use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key}; use crate::RateMibPerSec; -use crate::ingest_v2::metrics::INGEST_V2_METRICS; use crate::ingest_v2::state::WeakIngesterState; const ONE_MIB: ByteSize = ByteSize::mib(1); @@ -195,12 +194,10 @@ impl ShardThroughputTimeSeriesMap { .average() .as_u64() .div_ceil(ONE_MIB.as_u64()); - INGEST_V2_METRICS - .shard_st_throughput_mib - .observe(short_term_ingestion_rate_mib_per_sec_u64 as f64); - INGEST_V2_METRICS - .shard_lt_throughput_mib - .observe(long_term_ingestion_rate_mib_per_sec_u64 as f64); + crate::ingest_v2::metrics::SHARD_ST_THROUGHPUT_MIB + .record(short_term_ingestion_rate_mib_per_sec_u64 as f64); + crate::ingest_v2::metrics::SHARD_LT_THROUGHPUT_MIB + .record(long_term_ingestion_rate_mib_per_sec_u64 as f64); let short_term_ingestion_rate = RateMibPerSec(short_term_ingestion_rate_mib_per_sec_u64 as u16); @@ -300,10 +297,8 @@ impl BroadcastLocalShardsTask { } } } - INGEST_V2_METRICS.open_shards.set(num_open_shards as i64); - INGEST_V2_METRICS - .closed_shards - .set(num_closed_shards as i64); + crate::ingest_v2::metrics::OPEN_SHARDS.set(num_open_shards as f64); + crate::ingest_v2::metrics::CLOSED_SHARDS.set(num_closed_shards as f64); let snapshot = LocalShardsSnapshot { per_source_shard_infos, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index 73c1fb2858d..3d45ef7f571 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -22,7 +22,6 @@ use bytes::{BufMut, BytesMut}; use bytesize::ByteSize; use 
futures::StreamExt; use mrecordlog::Record; -use quickwit_common::metrics::MEMORY_METRICS; use quickwit_common::retry::RetryParams; use quickwit_common::stream_utils::{InFlightValue, TrackedSender}; use quickwit_common::{ServiceStream, spawn_named_task}; @@ -81,8 +80,10 @@ impl FetchStreamTask { .as_u64() .map(|offset| offset + 1) .unwrap_or_default(); - let (fetch_message_tx, fetch_stream) = - ServiceStream::new_bounded_with_gauge(3, &MEMORY_METRICS.in_flight.fetch_stream); + let (fetch_message_tx, fetch_stream) = ServiceStream::new_bounded_with_gauge( + 3, + &quickwit_common::metrics::IN_FLIGHT_FETCH_STREAM, + ); let mut fetch_task = Self { shard_id: open_fetch_stream_request.shard_id().clone(), queue_id: open_fetch_stream_request.queue_id(), @@ -559,7 +560,7 @@ async fn fault_tolerant_fetch_stream( let in_flight_value = InFlightValue::new( fetch_message, batch_size, - &MEMORY_METRICS.in_flight.multi_fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_MULTI_FETCH_STREAM, ); if fetch_message_tx.send(Ok(in_flight_value)).await.is_err() { // The consumer was dropped. @@ -572,7 +573,7 @@ async fn fault_tolerant_fetch_stream( let in_flight_value = InFlightValue::new( fetch_message, ByteSize(0), - &MEMORY_METRICS.in_flight.multi_fetch_stream, + &quickwit_common::metrics::IN_FLIGHT_MULTI_FETCH_STREAM, ); // We ignore the send error if the consumer was dropped because we're going // to return anyway. 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index c176f3d9313..ef4a4b1a6ad 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -25,11 +25,11 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use mrecordlog::error::CreateQueueError; use quickwit_cluster::Cluster; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_common::pretty::PrettyDisplay; use quickwit_common::pubsub::{EventBroker, EventSubscriber}; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; use quickwit_common::{ServiceStream, rate_limited_error, rate_limited_warn}; +use quickwit_metrics::{GaugeGuard, counter}; use quickwit_proto::control_plane::{ AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient, }; @@ -51,7 +51,6 @@ use super::broadcast::{BroadcastIngesterCapacityScoreTask, BroadcastLocalShardsT use super::doc_mapper::validate_doc_batch; use super::fetch::FetchStreamTask; use super::idle::CloseIdleShardsTask; -use super::metrics::INGEST_V2_METRICS; use super::models::IngesterShard; use super::mrecordlog_utils::{ AppendDocBatchError, append_non_empty_doc_batch, check_enough_capacity, @@ -332,10 +331,11 @@ impl Ingester { advise_reset_shards_response.shards_to_truncate.len(), now.elapsed().pretty_display() ); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["success"]) - .inc(); + counter!( + parent: &crate::ingest_v2::metrics::RESET_SHARDS_OPERATIONS_TOTAL, + "status" => "success", + ) + .increment(1); let wal_usage = state_guard.mrecordlog.resource_usage(); report_wal_usage(wal_usage); @@ -343,18 +343,20 @@ impl Ingester { Ok(Err(error)) => { warn!("advise reset shards request failed: {error}"); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["error"]) - .inc(); + counter!( + parent: 
&crate::ingest_v2::metrics::RESET_SHARDS_OPERATIONS_TOTAL, + "status" => "error", + ) + .increment(1); } Err(_) => { warn!("advise reset shards request timed out"); - INGEST_V2_METRICS - .reset_shards_operations_total - .with_label_values(["timeout"]) - .inc(); + counter!( + parent: &crate::ingest_v2::metrics::RESET_SHARDS_OPERATIONS_TOTAL, + "status" => "timeout", + ) + .increment(1); } }; // We still hold the permit while sleeping so we effectively rate limit the reset shards @@ -567,12 +569,16 @@ impl Ingester { }; if valid_doc_batch.is_empty() { - crate::metrics::INGEST_METRICS - .ingested_docs_invalid - .inc_by(parse_failures.len() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_invalid - .inc_by(original_batch_num_bytes); + counter!( + parent: &crate::metrics::DOCS_TOTAL, + "validity" => "invalid", + ) + .increment(parse_failures.len() as u64); + counter!( + parent: &crate::metrics::DOCS_BYTES_TOTAL, + "validity" => "invalid", + ) + .increment(original_batch_num_bytes); let persist_success = PersistSuccess { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, @@ -586,19 +592,27 @@ impl Ingester { continue; }; - crate::metrics::INGEST_METRICS - .ingested_docs_valid - .inc_by(valid_doc_batch.num_docs() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_valid - .inc_by(valid_doc_batch.num_bytes() as u64); + counter!( + parent: &crate::metrics::DOCS_TOTAL, + "validity" => "valid", + ) + .increment(valid_doc_batch.num_docs() as u64); + counter!( + parent: &crate::metrics::DOCS_BYTES_TOTAL, + "validity" => "valid", + ) + .increment(valid_doc_batch.num_bytes() as u64); if !parse_failures.is_empty() { - crate::metrics::INGEST_METRICS - .ingested_docs_invalid - .inc_by(parse_failures.len() as u64); - crate::metrics::INGEST_METRICS - .ingested_docs_bytes_invalid - .inc_by(original_batch_num_bytes - valid_doc_batch.num_bytes() as u64); + counter!( + parent: &crate::metrics::DOCS_TOTAL, + "validity" => "invalid", + ) + 
.increment(parse_failures.len() as u64); + counter!( + parent: &crate::metrics::DOCS_BYTES_TOTAL, + "validity" => "invalid", + ) + .increment(original_batch_num_bytes - valid_doc_batch.num_bytes() as u64); } let valid_batch_num_bytes = valid_doc_batch.num_bytes() as u64; shard.rate_meter.update(valid_batch_num_bytes); @@ -1111,8 +1125,9 @@ impl IngesterService for Ingester { _ => None, }) .sum::(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingester_persist); - gauge_guard.add(request_size_bytes as i64); + let mut _gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INGESTER_PERSIST); + _gauge_guard.increment(request_size_bytes as f64); self.persist_inner(persist_request).await } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs index 5e034f1bd36..4841f835e0a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs @@ -15,162 +15,150 @@ use std::sync::LazyLock; use mrecordlog::ResourceUsage; -use quickwit_common::metrics::{ - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, exponential_buckets, - linear_buckets, new_counter_vec, new_gauge, new_gauge_vec, new_histogram, new_histogram_vec, -}; +use quickwit_common::metrics::{exponential_buckets, linear_buckets}; +use quickwit_metrics::{Counter, Gauge, Histogram, Labels, counter, gauge, histogram}; -// Counter vec counting the different outcomes of ingest requests as -// measure at the end of the router work. -// -// The counter are counting persist subrequests. 
-pub(crate) struct IngestResultMetrics { - pub success: IntCounter, - pub circuit_breaker: IntCounter, - pub unspecified: IntCounter, - pub index_not_found: IntCounter, - pub source_not_found: IntCounter, - pub internal: IntCounter, - pub no_shards_available: IntCounter, - pub shard_rate_limited: IntCounter, - pub wal_full: IntCounter, - pub timeout: IntCounter, - pub router_timeout: IntCounter, - pub router_load_shedding: IntCounter, - pub load_shedding: IntCounter, - pub shard_not_found: IntCounter, - pub unavailable: IntCounter, -} +static INGEST_RESULT_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "ingest_result_total", + description: "Number of ingest requests by result", + subsystem: "ingest", + ) +}); -impl Default for IngestResultMetrics { - fn default() -> Self { - let ingest_result_total_vec = new_counter_vec::<1>( - "ingest_result_total", - "Number of ingest requests by result", - "ingest", - &[], - ["result"], - ); - Self { - success: ingest_result_total_vec.with_label_values(["success"]), - circuit_breaker: ingest_result_total_vec.with_label_values(["circuit_breaker"]), - unspecified: ingest_result_total_vec.with_label_values(["unspecified"]), - index_not_found: ingest_result_total_vec.with_label_values(["index_not_found"]), - source_not_found: ingest_result_total_vec.with_label_values(["source_not_found"]), - internal: ingest_result_total_vec.with_label_values(["internal"]), - no_shards_available: ingest_result_total_vec.with_label_values(["no_shards_available"]), - shard_rate_limited: ingest_result_total_vec.with_label_values(["shard_rate_limited"]), - wal_full: ingest_result_total_vec.with_label_values(["wal_full"]), - timeout: ingest_result_total_vec.with_label_values(["timeout"]), - router_timeout: ingest_result_total_vec.with_label_values(["router_timeout"]), - router_load_shedding: ingest_result_total_vec - .with_label_values(["router_load_shedding"]), - load_shedding: ingest_result_total_vec.with_label_values(["load_shedding"]), - 
unavailable: ingest_result_total_vec.with_label_values(["unavailable"]), - shard_not_found: ingest_result_total_vec.with_label_values(["shard_not_found"]), - } - } -} +pub(super) static INGEST_RESULT_SUCCESS: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "success")); -pub(super) struct IngestV2Metrics { - pub reset_shards_operations_total: IntCounterVec<1>, - pub open_shards: IntGauge, - pub closed_shards: IntGauge, - pub shard_lt_throughput_mib: Histogram, - pub shard_st_throughput_mib: Histogram, - pub wal_acquire_lock_requests_in_flight: IntGaugeVec<2>, - pub wal_acquire_lock_request_duration_secs: HistogramVec<2>, - pub wal_disk_used_bytes: IntGauge, - pub wal_memory_used_bytes: IntGauge, - pub ingest_results: IngestResultMetrics, - pub ingest_attempts: IntCounterVec<1>, -} +pub(super) static INGEST_RESULT_CIRCUIT_BREAKER: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "circuit_breaker")); -impl Default for IngestV2Metrics { - fn default() -> Self { - Self { - ingest_results: IngestResultMetrics::default(), - ingest_attempts: new_counter_vec::<1>( - "ingest_attempts", - "Number of routing attempts by AZ locality", - "ingest", - &[], - ["az_routing"], - ), - reset_shards_operations_total: new_counter_vec( - "reset_shards_operations_total", - "Total number of reset shards operations performed.", - "ingest", - &[], - ["status"], - ), - open_shards: new_gauge( - "shards", - "Number of shards hosted by the ingester.", - "ingest", - &[("state", "open")], - ), - closed_shards: new_gauge( - "shards", - "Number of shards hosted by the ingester.", - "ingest", - &[("state", "closed")], - ), - shard_lt_throughput_mib: new_histogram( - "shard_lt_throughput_mib", - "Shard long term throughput as reported through chitchat", - "ingest", - linear_buckets(0.0f64, 1.0f64, 15).unwrap(), - ), - shard_st_throughput_mib: new_histogram( - "shard_st_throughput_mib", - "Shard short term throughput as reported through 
chitchat", - "ingest", - linear_buckets(0.0f64, 1.0f64, 15).unwrap(), - ), - wal_acquire_lock_requests_in_flight: new_gauge_vec( - "wal_acquire_lock_requests_in_flight", - "Number of acquire lock requests in-flight.", - "ingest", - &[], - ["operation", "type"], - ), - wal_acquire_lock_request_duration_secs: new_histogram_vec( - "wal_acquire_lock_request_duration_secs", - "Duration of acquire lock requests in seconds.", - "ingest", - &[], - ["operation", "type"], - exponential_buckets(0.001, 2.0, 12).unwrap(), - ), - wal_disk_used_bytes: new_gauge( - "wal_disk_used_bytes", - "WAL disk space used in bytes.", - "ingest", - &[], - ), - wal_memory_used_bytes: new_gauge( - "wal_memory_used_bytes", - "WAL memory used in bytes.", - "ingest", - &[], - ), - } - } -} +pub(super) static INGEST_RESULT_UNSPECIFIED: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "unspecified")); + +pub(super) static INGEST_RESULT_INDEX_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "index_not_found")); + +pub(super) static INGEST_RESULT_SOURCE_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "source_not_found")); + +pub(super) static INGEST_RESULT_INTERNAL: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "internal")); + +pub(super) static INGEST_RESULT_NO_SHARDS_AVAILABLE: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "no_shards_available")); + +pub(super) static INGEST_RESULT_SHARD_RATE_LIMITED: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "shard_rate_limited")); + +pub(super) static INGEST_RESULT_WAL_FULL: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "wal_full")); + +pub(super) static INGEST_RESULT_TIMEOUT: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "timeout")); + +pub(super) static 
INGEST_RESULT_ROUTER_TIMEOUT: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "router_timeout")); + +pub(super) static INGEST_RESULT_ROUTER_LOAD_SHEDDING: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "router_load_shedding")); + +pub(super) static INGEST_RESULT_LOAD_SHEDDING: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "load_shedding")); + +pub(super) static INGEST_RESULT_SHARD_NOT_FOUND: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "shard_not_found")); + +pub(super) static INGEST_RESULT_UNAVAILABLE: LazyLock = + LazyLock::new(|| counter!(parent: &*INGEST_RESULT_TOTAL, "result" => "unavailable")); + +pub(super) static INGEST_ATTEMPTS: LazyLock = LazyLock::new(|| { + counter!( + name: "ingest_attempts", + description: "Number of routing attempts by AZ locality", + subsystem: "ingest", + ) +}); + +pub(super) const AZ_ROUTING_LABELS: Labels<1> = Labels::new(["az_routing"]); + +pub(super) static RESET_SHARDS_OPERATIONS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "reset_shards_operations_total", + description: "Total number of reset shards operations performed.", + subsystem: "ingest", + ) +}); + +static SHARDS: LazyLock = LazyLock::new(|| { + gauge!( + name: "shards", + description: "Number of shards hosted by the ingester.", + subsystem: "ingest", + ) +}); + +pub(super) static OPEN_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: &*SHARDS, "state" => "open")); + +pub(super) static CLOSED_SHARDS: LazyLock = + LazyLock::new(|| gauge!(parent: &*SHARDS, "state" => "closed")); + +pub(super) static SHARD_LT_THROUGHPUT_MIB: LazyLock = LazyLock::new(|| { + histogram!( + name: "shard_lt_throughput_mib", + description: "Shard long term throughput as reported through chitchat", + subsystem: "ingest", + buckets: linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ) +}); + +pub(super) static SHARD_ST_THROUGHPUT_MIB: LazyLock = 
LazyLock::new(|| { + histogram!( + name: "shard_st_throughput_mib", + description: "Shard short term throughput as reported through chitchat", + subsystem: "ingest", + buckets: linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ) +}); + +pub(super) static WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_acquire_lock_requests_in_flight", + description: "Number of acquire lock requests in-flight.", + subsystem: "ingest", + ) +}); + +pub(super) static WAL_ACQUIRE_LOCK_REQUEST_DURATION_SECS: LazyLock = + LazyLock::new(|| { + histogram!( + name: "wal_acquire_lock_request_duration_secs", + description: "Duration of acquire lock requests in seconds.", + subsystem: "ingest", + buckets: exponential_buckets(0.001, 2.0, 12).unwrap(), + ) + }); + +pub(super) const WAL_LOCK_METRIC_LABELS: Labels<2> = Labels::new(["operation", "type"]); + +pub(super) static WAL_DISK_USED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_disk_used_bytes", + description: "WAL disk space used in bytes.", + subsystem: "ingest", + ) +}); + +pub(super) static WAL_MEMORY_USED_BYTES: LazyLock = LazyLock::new(|| { + gauge!( + name: "wal_memory_used_bytes", + description: "WAL memory used in bytes.", + subsystem: "ingest", + ) +}); pub(super) fn report_wal_usage(wal_usage: ResourceUsage) { - INGEST_V2_METRICS - .wal_disk_used_bytes - .set(wal_usage.disk_used_bytes as i64); - quickwit_common::metrics::MEMORY_METRICS - .in_flight - .wal - .set(wal_usage.memory_allocated_bytes as i64); - INGEST_V2_METRICS - .wal_memory_used_bytes - .set(wal_usage.memory_used_bytes as i64); + WAL_DISK_USED_BYTES.set(wal_usage.disk_used_bytes as f64); + quickwit_common::metrics::IN_FLIGHT_WAL.set(wal_usage.memory_allocated_bytes as f64); + WAL_MEMORY_USED_BYTES.set(wal_usage.memory_used_bytes as f64); } - -pub(super) static INGEST_V2_METRICS: LazyLock = - LazyLock::new(IngestV2Metrics::default); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs 
b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs index bbf0cd037c5..adf562f9bc1 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs @@ -18,8 +18,8 @@ use std::time::{Duration, Instant}; use bytesize::ByteSize; use futures::{Future, StreamExt}; use mrecordlog::error::CreateQueueError; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_common::{ServiceStream, rate_limited_warn}; +use quickwit_metrics::GaugeGuard; use quickwit_proto::ingest::ingester::{ AckReplicationMessage, IngesterStatus, InitReplicaRequest, InitReplicaResponse, ReplicateFailure, ReplicateFailureReason, ReplicateRequest, ReplicateResponse, @@ -38,7 +38,6 @@ use super::models::IngesterShard; use super::mrecordlog_utils::check_enough_capacity; use super::state::IngesterState; use crate::ingest_v2::mrecordlog_utils::{AppendDocBatchError, append_non_empty_doc_batch}; -use crate::metrics::INGEST_METRICS; use crate::{estimate_size, with_lock_metrics}; pub(super) const SYN_REPLICATION_STREAM_CAPACITY: usize = 5; @@ -504,8 +503,9 @@ impl ReplicationTask { ))); } let request_size_bytes = replicate_request.num_bytes(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingester_replicate); - gauge_guard.add(request_size_bytes as i64); + let mut _gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INGESTER_REPLICATE); + _gauge_guard.increment(request_size_bytes as f64); self.current_replication_seqno += 1; @@ -667,12 +667,8 @@ impl ReplicationTask { .expect("replica shard should be initialized") .set_replication_position_inclusive(current_position_inclusive.clone(), now); - INGEST_METRICS - .replicated_num_bytes_total - .inc_by(batch_num_bytes); - INGEST_METRICS - .replicated_num_docs_total - .inc_by(batch_num_docs); + crate::metrics::REPLICATED_NUM_BYTES_TOTAL.increment(batch_num_bytes); + crate::metrics::REPLICATED_NUM_DOCS_TOTAL.increment(batch_num_docs); 
let replicate_success = ReplicateSuccess { subrequest_id: subrequest.subrequest_id, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index e249dd1e0fe..fb03c76609d 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -20,9 +20,9 @@ use std::time::Duration; use async_trait::async_trait; use futures::stream::FuturesUnordered; use futures::{Future, StreamExt}; -use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_common::pubsub::{EventBroker, EventSubscriber}; use quickwit_common::{rate_limited_error, rate_limited_warn}; +use quickwit_metrics::{GaugeGuard, counter}; use quickwit_proto::control_plane::{ ControlPlaneService, ControlPlaneServiceClient, GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest, @@ -45,12 +45,10 @@ use super::debouncing::{ DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer, }; use super::ingester::PERSIST_REQUEST_TIMEOUT; -use super::metrics::IngestResultMetrics; use super::routing_table::RoutingTable; use super::workbench::IngestWorkbench; -use super::{IngesterPool, pending_subrequests}; +use super::{IngesterPool, metrics, pending_subrequests}; use crate::get_ingest_router_buffer_size; -use crate::ingest_v2::metrics::INGEST_V2_METRICS; /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`]. 
fn ingest_request_timeout() -> Duration { @@ -371,10 +369,12 @@ impl IngestRouter { let az_locality = state_guard .routing_table .classify_az_locality(&ingester_node.node_id, &self.ingester_pool); - INGEST_V2_METRICS - .ingest_attempts - .with_label_values([az_locality]) - .inc(); + let labels = crate::ingest_v2::metrics::AZ_ROUTING_LABELS.with_values([az_locality]); + counter!( + parent: &crate::ingest_v2::metrics::INGEST_ATTEMPTS, + labels: &labels, + ) + .increment(1); let persist_subrequest = PersistSubrequest { subrequest_id: subrequest.subrequest_id, index_uid: Some(ingester_node.index_uid.clone()), @@ -492,82 +492,73 @@ impl IngestRouter { fn update_ingest_metrics(ingest_result: &IngestV2Result, num_subrequests: usize) { let num_subrequests = num_subrequests as u64; - let ingest_results_metrics: &IngestResultMetrics = &INGEST_V2_METRICS.ingest_results; match ingest_result { Ok(ingest_response) => { - ingest_results_metrics - .success - .inc_by(ingest_response.successes.len() as u64); + metrics::INGEST_RESULT_SUCCESS.increment(ingest_response.successes.len() as u64); for ingest_failure in &ingest_response.failures { match ingest_failure.reason() { IngestFailureReason::CircuitBreaker => { - ingest_results_metrics.circuit_breaker.inc(); + metrics::INGEST_RESULT_CIRCUIT_BREAKER.increment(1); + } + IngestFailureReason::Unspecified => { + metrics::INGEST_RESULT_UNSPECIFIED.increment(1) } - IngestFailureReason::Unspecified => ingest_results_metrics.unspecified.inc(), IngestFailureReason::IndexNotFound => { - ingest_results_metrics.index_not_found.inc() + metrics::INGEST_RESULT_INDEX_NOT_FOUND.increment(1) } IngestFailureReason::SourceNotFound => { - ingest_results_metrics.source_not_found.inc() + metrics::INGEST_RESULT_SOURCE_NOT_FOUND.increment(1) } - IngestFailureReason::Internal => ingest_results_metrics.internal.inc(), + IngestFailureReason::Internal => metrics::INGEST_RESULT_INTERNAL.increment(1), IngestFailureReason::NoShardsAvailable => { - 
ingest_results_metrics.no_shards_available.inc() + metrics::INGEST_RESULT_NO_SHARDS_AVAILABLE.increment(1) } IngestFailureReason::ShardRateLimited => { - ingest_results_metrics.shard_rate_limited.inc() + metrics::INGEST_RESULT_SHARD_RATE_LIMITED.increment(1) } - IngestFailureReason::WalFull => ingest_results_metrics.wal_full.inc(), - IngestFailureReason::Timeout => ingest_results_metrics.timeout.inc(), + IngestFailureReason::WalFull => metrics::INGEST_RESULT_WAL_FULL.increment(1), + IngestFailureReason::Timeout => metrics::INGEST_RESULT_TIMEOUT.increment(1), IngestFailureReason::RouterLoadShedding => { - ingest_results_metrics.router_load_shedding.inc() + metrics::INGEST_RESULT_ROUTER_LOAD_SHEDDING.increment(1) + } + IngestFailureReason::LoadShedding => { + metrics::INGEST_RESULT_LOAD_SHEDDING.increment(1) } - IngestFailureReason::LoadShedding => ingest_results_metrics.load_shedding.inc(), } } } Err(ingest_error) => match ingest_error { IngestV2Error::TooManyRequests(rate_limiting_cause) => match rate_limiting_cause { RateLimitingCause::RouterLoadShedding => { - ingest_results_metrics - .router_load_shedding - .inc_by(num_subrequests); + metrics::INGEST_RESULT_ROUTER_LOAD_SHEDDING.increment(num_subrequests); } RateLimitingCause::LoadShedding => { - ingest_results_metrics.load_shedding.inc_by(num_subrequests) + metrics::INGEST_RESULT_LOAD_SHEDDING.increment(num_subrequests) } RateLimitingCause::WalFull => { - ingest_results_metrics.wal_full.inc_by(num_subrequests); + metrics::INGEST_RESULT_WAL_FULL.increment(num_subrequests); } RateLimitingCause::CircuitBreaker => { - ingest_results_metrics - .circuit_breaker - .inc_by(num_subrequests); + metrics::INGEST_RESULT_CIRCUIT_BREAKER.increment(num_subrequests); } RateLimitingCause::ShardRateLimiting => { - ingest_results_metrics - .shard_rate_limited - .inc_by(num_subrequests); + metrics::INGEST_RESULT_SHARD_RATE_LIMITED.increment(num_subrequests); } RateLimitingCause::Unknown => { - 
ingest_results_metrics.unspecified.inc_by(num_subrequests); + metrics::INGEST_RESULT_UNSPECIFIED.increment(num_subrequests); } }, IngestV2Error::Timeout(_) => { - ingest_results_metrics - .router_timeout - .inc_by(num_subrequests); + metrics::INGEST_RESULT_ROUTER_TIMEOUT.increment(num_subrequests); } IngestV2Error::ShardNotFound { .. } => { - ingest_results_metrics - .shard_not_found - .inc_by(num_subrequests); + metrics::INGEST_RESULT_SHARD_NOT_FOUND.increment(num_subrequests); } IngestV2Error::Unavailable(_) => { - ingest_results_metrics.unavailable.inc_by(num_subrequests); + metrics::INGEST_RESULT_UNAVAILABLE.increment(num_subrequests); } IngestV2Error::Internal(_) => { - ingest_results_metrics.internal.inc_by(num_subrequests); + metrics::INGEST_RESULT_INTERNAL.increment(num_subrequests); } }, } @@ -578,8 +569,9 @@ impl IngestRouterService for IngestRouter { async fn ingest(&self, ingest_request: IngestRequestV2) -> IngestV2Result { let request_size_bytes = ingest_request.num_bytes(); - let mut gauge_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.ingest_router); - gauge_guard.add(request_size_bytes as i64); + let mut _gauge_guard = + GaugeGuard::from_gauge(&quickwit_common::metrics::IN_FLIGHT_INGEST_ROUTER); + _gauge_guard.increment(request_size_bytes as f64); let num_subrequests = ingest_request.subrequests.len(); let _permit = self diff --git a/quickwit/quickwit-ingest/src/lib.rs b/quickwit/quickwit-ingest/src/lib.rs index f021f4888c1..07e96f75731 100644 --- a/quickwit/quickwit-ingest/src/lib.rs +++ b/quickwit/quickwit-ingest/src/lib.rs @@ -107,12 +107,15 @@ pub async fn start_ingest_api_service( #[macro_export] macro_rules! 
with_lock_metrics { - ($future:expr, $($label:tt),*) => { + ($future:expr, $operation:expr, $kind:expr) => { { - $crate::ingest_v2::metrics::INGEST_V2_METRICS - .wal_acquire_lock_requests_in_flight - .with_label_values([$($label),*]) - .inc(); + let labels = + $crate::ingest_v2::metrics::WAL_LOCK_METRIC_LABELS.with_values([$operation, $kind]); + quickwit_metrics::gauge!( + parent: &$crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT, + labels: &labels, + ) + .increment(1.0); let now = std::time::Instant::now(); let guard = $future; @@ -124,14 +127,16 @@ macro_rules! with_lock_metrics { "lock acquisition took {}ms", elapsed.as_millis() ); } - $crate::ingest_v2::metrics::INGEST_V2_METRICS - .wal_acquire_lock_requests_in_flight - .with_label_values([$($label),*]) - .dec(); - $crate::ingest_v2::metrics::INGEST_V2_METRICS - .wal_acquire_lock_request_duration_secs - .with_label_values([$($label),*]) - .observe(elapsed.as_secs_f64()); + quickwit_metrics::gauge!( + parent: &$crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUESTS_IN_FLIGHT, + labels: &labels, + ) + .decrement(1.0); + quickwit_metrics::histogram!( + parent: &$crate::ingest_v2::metrics::WAL_ACQUIRE_LOCK_REQUEST_DURATION_SECS, + labels: &labels, + ) + .record(elapsed.as_secs_f64()); guard } diff --git a/quickwit/quickwit-ingest/src/metrics.rs b/quickwit/quickwit-ingest/src/metrics.rs index 7b6888243e5..fbb63e47df2 100644 --- a/quickwit/quickwit-ingest/src/metrics.rs +++ b/quickwit/quickwit-ingest/src/metrics.rs @@ -14,69 +14,45 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntCounter, IntGauge, new_counter, new_counter_vec, new_gauge}; - -pub struct IngestMetrics { - pub ingested_docs_bytes_valid: IntCounter, - pub ingested_docs_bytes_invalid: IntCounter, - pub ingested_docs_invalid: IntCounter, - pub ingested_docs_valid: IntCounter, - - pub replicated_num_bytes_total: IntCounter, - pub replicated_num_docs_total: IntCounter, - #[allow(dead_code)] // this really shouldn't be dead, it 
needs to be used somewhere - pub queue_count: IntGauge, -} - -impl Default for IngestMetrics { - fn default() -> Self { - let ingest_docs_bytes_total = new_counter_vec( - "docs_bytes_total", - "Total size of the docs ingested, measured in ingester's leader, after validation and \ - before persistence/replication", - "ingest", - &[], - ["validity"], - ); - let ingested_docs_bytes_valid = ingest_docs_bytes_total.with_label_values(["valid"]); - let ingested_docs_bytes_invalid = ingest_docs_bytes_total.with_label_values(["invalid"]); - - let ingest_docs_total = new_counter_vec( - "docs_total", - "Total number of the docs ingested, measured in ingester's leader, after validation \ - and before persistence/replication", - "ingest", - &[], - ["validity"], - ); - let ingested_docs_valid = ingest_docs_total.with_label_values(["valid"]); - let ingested_docs_invalid = ingest_docs_total.with_label_values(["invalid"]); - - IngestMetrics { - ingested_docs_bytes_valid, - ingested_docs_bytes_invalid, - ingested_docs_valid, - ingested_docs_invalid, - replicated_num_bytes_total: new_counter( - "replicated_num_bytes_total", - "Total size in bytes of the replicated docs.", - "ingest", - &[], - ), - replicated_num_docs_total: new_counter( - "replicated_num_docs_total", - "Total number of docs replicated.", - "ingest", - &[], - ), - queue_count: new_gauge( - "queue_count", - "Number of queues currently active", - "ingest", - &[], - ), - } - } -} - -pub static INGEST_METRICS: LazyLock = LazyLock::new(IngestMetrics::default); +use quickwit_metrics::{Counter, Gauge, counter, gauge}; + +pub(crate) static DOCS_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "docs_bytes_total", + description: "Total size of the docs ingested, measured in ingester's leader, after validation and before persistence/replication", + subsystem: "ingest", + ) +}); + +pub(crate) static DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "docs_total", + description: "Total number of the docs 
ingested, measured in ingester's leader, after validation and before persistence/replication", + subsystem: "ingest", + ) +}); + +pub(crate) static REPLICATED_NUM_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "replicated_num_bytes_total", + description: "Total size in bytes of the replicated docs.", + subsystem: "ingest", + ) +}); + +pub(crate) static REPLICATED_NUM_DOCS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "replicated_num_docs_total", + description: "Total number of docs replicated.", + subsystem: "ingest", + ) +}); + +#[allow(dead_code)] // this really shouldn't be dead, it needs to be used somewhere +pub(crate) static QUEUE_COUNT: LazyLock = LazyLock::new(|| { + gauge!( + name: "queue_count", + description: "Number of queues currently active", + subsystem: "ingest", + ) +}); diff --git a/quickwit/quickwit-jaeger/Cargo.toml b/quickwit/quickwit-jaeger/Cargo.toml index 1ebebc8dbfb..a2d686c3f7a 100644 --- a/quickwit/quickwit-jaeger/Cargo.toml +++ b/quickwit/quickwit-jaeger/Cargo.toml @@ -26,6 +26,7 @@ tonic = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-opentelemetry = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-jaeger/src/lib.rs b/quickwit/quickwit-jaeger/src/lib.rs index 1b6dfc27d0c..caac5c4c2e9 100644 --- a/quickwit/quickwit-jaeger/src/lib.rs +++ b/quickwit/quickwit-jaeger/src/lib.rs @@ -22,6 +22,7 @@ use itertools::{Either, Itertools}; use prost::Message; use prost_types::{Duration as WellKnownDuration, Timestamp as WellKnownTimestamp}; use quickwit_config::JaegerConfig; +use quickwit_metrics::{counter, histogram}; use quickwit_opentelemetry::otlp::{ Event as QwEvent, Link as QwLink, OTEL_TRACES_INDEX_ID, Span as QwSpan, SpanFingerprint, SpanId, SpanKind as QwSpanKind, SpanStatus as QwSpanStatus, TraceId, @@ -51,8 +52,6 @@ use tonic::Status; use 
tracing::field::Empty; use tracing::{Span as RuntimeSpan, debug, error, instrument, warn}; -pub(crate) use crate::metrics::JAEGER_SERVICE_METRICS; - mod metrics; mod v1; mod v2; @@ -415,43 +414,52 @@ impl JaegerService { current_span.record("num_spans", num_spans_total); current_span.record("num_bytes", num_bytes_total); - JAEGER_SERVICE_METRICS - .fetched_traces_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_traces); + let labels = crate::metrics::OPERATION_INDEX_LABELS + .with_values([operation_name, OTEL_TRACES_INDEX_ID]); + counter!(parent: &crate::metrics::FETCHED_TRACES_TOTAL, labels: &labels) + .increment(num_traces); let elapsed = request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "false"]) - .observe(elapsed); + let duration_labels = crate::metrics::OPERATION_INDEX_ERROR_LABELS.with_values([ + operation_name, + OTEL_TRACES_INDEX_ID, + "false", + ]); + histogram!( + parent: &crate::metrics::REQUEST_DURATION_SECONDS, + labels: &duration_labels, + ) + .record(elapsed); }); Ok(ReceiverStream::new(rx)) } } pub(crate) fn record_error(operation_name: &'static str, request_start: Instant) { - JAEGER_SERVICE_METRICS - .request_errors_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc(); + let labels = + crate::metrics::OPERATION_INDEX_LABELS.with_values([operation_name, OTEL_TRACES_INDEX_ID]); + counter!(parent: &crate::metrics::REQUEST_ERRORS_TOTAL, labels: &labels).increment(1); let elapsed = request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "true"]) - .observe(elapsed); + let duration_labels = crate::metrics::OPERATION_INDEX_ERROR_LABELS.with_values([ + operation_name, + OTEL_TRACES_INDEX_ID, + "true", + ]); + histogram!( + parent: &crate::metrics::REQUEST_DURATION_SECONDS, + labels: &duration_labels, + ) + 
.record(elapsed); } pub(crate) fn record_send(operation_name: &'static str, num_spans: usize, num_bytes: usize) { - JAEGER_SERVICE_METRICS - .fetched_spans_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_spans as u64); - JAEGER_SERVICE_METRICS - .transferred_bytes_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(num_bytes as u64); + let labels = + crate::metrics::OPERATION_INDEX_LABELS.with_values([operation_name, OTEL_TRACES_INDEX_ID]); + counter!(parent: &crate::metrics::FETCHED_SPANS_TOTAL, labels: &labels) + .increment(num_spans as u64); + counter!(parent: &crate::metrics::TRANSFERRED_BYTES_TOTAL, labels: &labels) + .increment(num_bytes as u64); } #[allow(deprecated)] diff --git a/quickwit/quickwit-jaeger/src/metrics.rs b/quickwit/quickwit-jaeger/src/metrics.rs index 3095b68b59f..4a53c75f41e 100644 --- a/quickwit/quickwit-jaeger/src/metrics.rs +++ b/quickwit/quickwit-jaeger/src/metrics.rs @@ -14,68 +14,58 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - HistogramVec, IntCounterVec, exponential_buckets, new_counter_vec, new_histogram_vec, -}; +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{Counter, Histogram, Labels, counter, histogram}; -pub struct JaegerServiceMetrics { - pub requests_total: IntCounterVec<2>, - pub request_errors_total: IntCounterVec<2>, - pub request_duration_seconds: HistogramVec<3>, - pub fetched_traces_total: IntCounterVec<2>, - pub fetched_spans_total: IntCounterVec<2>, - pub transferred_bytes_total: IntCounterVec<2>, -} +pub(crate) const OPERATION_INDEX_LABELS: Labels<2> = Labels::new(["operation", "index"]); +pub(crate) const OPERATION_INDEX_ERROR_LABELS: Labels<3> = + Labels::new(["operation", "index", "error"]); -impl Default for JaegerServiceMetrics { - fn default() -> Self { - Self { - requests_total: new_counter_vec( - "requests_total", - "Number of requests", - "jaeger", - &[], - ["operation", "index"], - ), - 
request_errors_total: new_counter_vec( - "request_errors_total", - "Number of failed requests", - "jaeger", - &[], - ["operation", "index"], - ), - request_duration_seconds: new_histogram_vec( - "request_duration_seconds", - "Duration of requests", - "jaeger", - &[], - ["operation", "index", "error"], - exponential_buckets(0.02, 2.0, 8).unwrap(), - ), - fetched_traces_total: new_counter_vec( - "fetched_traces_total", - "Number of traces retrieved from storage", - "jaeger", - &[], - ["operation", "index"], - ), - fetched_spans_total: new_counter_vec( - "fetched_spans_total", - "Number of spans retrieved from storage", - "jaeger", - &[], - ["operation", "index"], - ), - transferred_bytes_total: new_counter_vec( - "transferred_bytes_total", - "Number of bytes transferred", - "jaeger", - &[], - ["operation", "index"], - ), - } - } -} +pub(crate) static REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "requests_total", + description: "Number of requests", + subsystem: "jaeger", + ) +}); -pub static JAEGER_SERVICE_METRICS: LazyLock = - LazyLock::new(JaegerServiceMetrics::default); +pub(crate) static REQUEST_ERRORS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "request_errors_total", + description: "Number of failed requests", + subsystem: "jaeger", + ) +}); + +pub(crate) static REQUEST_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "request_duration_seconds", + description: "Duration of requests", + subsystem: "jaeger", + buckets: exponential_buckets(0.02, 2.0, 8).unwrap(), + ) +}); + +pub(crate) static FETCHED_TRACES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "fetched_traces_total", + description: "Number of traces retrieved from storage", + subsystem: "jaeger", + ) +}); + +pub(crate) static FETCHED_SPANS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "fetched_spans_total", + description: "Number of spans retrieved from storage", + subsystem: "jaeger", + ) +}); + +pub(crate) static 
TRANSFERRED_BYTES_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "transferred_bytes_total", + description: "Number of bytes transferred", + subsystem: "jaeger", + ) +}); diff --git a/quickwit/quickwit-jaeger/src/v1.rs b/quickwit/quickwit-jaeger/src/v1.rs index 11d6935db4e..dff0b08e26b 100644 --- a/quickwit/quickwit-jaeger/src/v1.rs +++ b/quickwit/quickwit-jaeger/src/v1.rs @@ -17,6 +17,7 @@ use std::time::Instant; use async_trait::async_trait; +use quickwit_metrics::{counter, histogram}; use quickwit_opentelemetry::otlp::{ OTEL_TRACES_INDEX_ID, extract_otel_traces_index_id_patterns_from_metadata, }; @@ -27,26 +28,40 @@ use quickwit_proto::jaeger::storage::v1::{ }; use tonic::{Request, Response, Status}; -use crate::metrics::JAEGER_SERVICE_METRICS; use crate::{JaegerService, SpanStream}; macro_rules! metrics { - ($expr:expr, [$operation:ident, $($label:expr),*]) => { + ($expr:expr, [$operation:ident, $index:expr]) => { let start = std::time::Instant::now(); - let labels = [stringify!($operation), $($label,)*]; - JAEGER_SERVICE_METRICS.requests_total.with_label_values(labels).inc(); + let operation = stringify!($operation); + let index = $index; + let labels = crate::metrics::OPERATION_INDEX_LABELS.with_values([operation, index]); + counter!( + parent: &crate::metrics::REQUESTS_TOTAL, + labels: &labels, + ) + .increment(1); let (res, is_error) = match $expr { ok @ Ok(_) => { (ok, "false") }, err @ Err(_) => { - JAEGER_SERVICE_METRICS.request_errors_total.with_label_values(labels).inc(); + counter!( + parent: &crate::metrics::REQUEST_ERRORS_TOTAL, + labels: &labels, + ) + .increment(1); (err, "true") }, }; let elapsed = start.elapsed().as_secs_f64(); - let labels = [stringify!($operation), $($label,)* is_error]; - JAEGER_SERVICE_METRICS.request_duration_seconds.with_label_values(labels).observe(elapsed); + let duration_labels = + crate::metrics::OPERATION_INDEX_ERROR_LABELS.with_values([operation, index, is_error]); + histogram!( + parent: 
&crate::metrics::REQUEST_DURATION_SECONDS, + labels: &duration_labels, + ) + .record(elapsed); return res.map(Response::new); }; diff --git a/quickwit/quickwit-jaeger/src/v2.rs b/quickwit/quickwit-jaeger/src/v2.rs index e355c18a8c3..4e8f4cf8532 100644 --- a/quickwit/quickwit-jaeger/src/v2.rs +++ b/quickwit/quickwit-jaeger/src/v2.rs @@ -19,6 +19,7 @@ use std::time::Instant; use async_trait::async_trait; use prost_types::Timestamp as WellKnownTimestamp; +use quickwit_metrics::{counter, histogram}; use quickwit_opentelemetry::otlp::{ OTEL_TRACES_INDEX_ID, Span as QwSpan, TraceId, extract_otel_traces_index_id_patterns_from_metadata, @@ -50,29 +51,43 @@ use tonic::{Request, Response, Status}; use tracing::field::Empty; use tracing::{Span as RuntimeSpan, debug, error, instrument}; -use crate::metrics::JAEGER_SERVICE_METRICS; use crate::{ JaegerService, TimeIntervalSecs, TracesDataStream, get_operations_impl, get_services_impl, json_deserialize, record_error, record_send, to_duration_millis, }; macro_rules! 
metrics { - ($expr:expr, [$operation:ident, $($label:expr),*]) => { + ($expr:expr, [$operation:ident, $index:expr]) => { let start = std::time::Instant::now(); - let labels = [stringify!($operation), $($label,)*]; - JAEGER_SERVICE_METRICS.requests_total.with_label_values(labels).inc(); + let operation = stringify!($operation); + let index = $index; + let labels = crate::metrics::OPERATION_INDEX_LABELS.with_values([operation, index]); + counter!( + parent: &crate::metrics::REQUESTS_TOTAL, + labels: &labels, + ) + .increment(1); let (res, is_error) = match $expr { ok @ Ok(_) => { (ok, "false") }, err @ Err(_) => { - JAEGER_SERVICE_METRICS.request_errors_total.with_label_values(labels).inc(); + counter!( + parent: &crate::metrics::REQUEST_ERRORS_TOTAL, + labels: &labels, + ) + .increment(1); (err, "true") }, }; let elapsed = start.elapsed().as_secs_f64(); - let labels = [stringify!($operation), $($label,)* is_error]; - JAEGER_SERVICE_METRICS.request_duration_seconds.with_label_values(labels).observe(elapsed); + let duration_labels = + crate::metrics::OPERATION_INDEX_ERROR_LABELS.with_values([operation, index, is_error]); + histogram!( + parent: &crate::metrics::REQUEST_DURATION_SECONDS, + labels: &duration_labels, + ) + .record(elapsed); return res.map(Response::new); }; @@ -426,16 +441,22 @@ async fn stream_otel_spans_impl( record_send(operation_name, num_spans, num_bytes); - JAEGER_SERVICE_METRICS - .fetched_traces_total - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID]) - .inc_by(trace_ids.len() as u64); + let labels = + crate::metrics::OPERATION_INDEX_LABELS.with_values([operation_name, OTEL_TRACES_INDEX_ID]); + counter!(parent: &crate::metrics::FETCHED_TRACES_TOTAL, labels: &labels) + .increment(trace_ids.len() as u64); let elapsed = request_start.elapsed().as_secs_f64(); - JAEGER_SERVICE_METRICS - .request_duration_seconds - .with_label_values([operation_name, OTEL_TRACES_INDEX_ID, "false"]) - .observe(elapsed); + let duration_labels = 
crate::metrics::OPERATION_INDEX_ERROR_LABELS.with_values([ + operation_name, + OTEL_TRACES_INDEX_ID, + "false", + ]); + histogram!( + parent: &crate::metrics::REQUEST_DURATION_SECONDS, + labels: &duration_labels, + ) + .record(elapsed); Ok(qw_spans) } diff --git a/quickwit/quickwit-janitor/Cargo.toml b/quickwit/quickwit-janitor/Cargo.toml index ecb243a9990..e8063895f24 100644 --- a/quickwit/quickwit-janitor/Cargo.toml +++ b/quickwit/quickwit-janitor/Cargo.toml @@ -26,6 +26,7 @@ utoipa = { workspace = true } quickwit-actors = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } diff --git a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs index 5e08b7773e6..b7b17845c57 100644 --- a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs +++ b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs @@ -26,6 +26,7 @@ use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_indexing::actors::{MergeSchedulerService, MergeSplitDownloader, schedule_merge}; use quickwit_indexing::merge_policy::MergeOperation; use quickwit_metastore::{ListSplitsResponseExt, Split, split_tag_filter, split_time_range_filter}; +use quickwit_metrics::gauge; use quickwit_proto::metastore::{ DeleteTask, LastDeleteOpstampRequest, ListDeleteTasksRequest, ListStaleSplitsRequest, MetastoreResult, MetastoreService, MetastoreServiceClient, UpdateSplitsDeleteOpstampRequest, @@ -37,8 +38,6 @@ use serde::Serialize; use tantivy::Inventory; use tracing::{debug, info}; -use crate::metrics::JANITOR_METRICS; - const PLANNER_REFRESH_INTERVAL: Duration = Duration::from_secs(60); const NUM_STALE_SPLITS_TO_FETCH: usize = 1000; @@ -205,11 +204,14 @@ impl DeleteTaskPlanner { ) .await?; let index_label = - 
quickwit_common::metrics::index_label(self.index_uid.index_id.as_str()); - JANITOR_METRICS - .ongoing_num_delete_operations_total - .with_label_values([index_label]) - .set(self.ongoing_delete_operations_inventory.list().len() as i64); + quickwit_common::metrics::index_label(self.index_uid.index_id.as_str()) + .to_string(); + let labels = crate::metrics::INDEX_LABELS.with_values([index_label]); + gauge!( + parent: &crate::metrics::ONGOING_NUM_DELETE_OPERATIONS_TOTAL, + labels: &labels, + ) + .set(self.ongoing_delete_operations_inventory.list().len() as f64); } } diff --git a/quickwit/quickwit-janitor/src/actors/garbage_collector.rs b/quickwit/quickwit-janitor/src/actors/garbage_collector.rs index 21411bb0192..7f0df9d89f6 100644 --- a/quickwit/quickwit-janitor/src/actors/garbage_collector.rs +++ b/quickwit/quickwit-janitor/src/actors/garbage_collector.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -23,6 +24,7 @@ use quickwit_common::is_parquet_pipeline_index; use quickwit_common::shared_consts::split_deletion_grace_period; use quickwit_index_management::{GcMetrics, run_garbage_collect, run_parquet_garbage_collect}; use quickwit_metastore::ListIndexesMetadataResponseExt; +use quickwit_metrics::counter; use quickwit_proto::metastore::{ ListIndexesMetadataRequest, MetastoreService, MetastoreServiceClient, }; @@ -31,8 +33,6 @@ use quickwit_storage::{Storage, StorageResolver}; use serde::Serialize; use tracing::{debug, error, info}; -use crate::metrics::JANITOR_METRICS; - const RUN_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes /// Result of a GC run (tantivy or parquet). 
@@ -55,19 +55,25 @@ impl GcRunResult { } fn gc_metrics(split_type: &str) -> GcMetrics { + let split_type = split_type.to_string(); + let success_labels = crate::metrics::GC_RESULT_SPLIT_TYPE_LABELS + .with_values([Cow::Borrowed("success"), Cow::Owned(split_type.clone())]); + let split_type_labels = crate::metrics::GC_SPLIT_TYPE_LABELS.with_values([split_type.clone()]); + let error_labels = crate::metrics::GC_RESULT_SPLIT_TYPE_LABELS + .with_values([Cow::Borrowed("error"), Cow::Owned(split_type)]); GcMetrics { - deleted_splits: JANITOR_METRICS - .gc_deleted_splits - .with_label_values(["success", split_type]) - .clone(), - deleted_bytes: JANITOR_METRICS - .gc_deleted_bytes - .with_label_values([split_type]) - .clone(), - failed_splits: JANITOR_METRICS - .gc_deleted_splits - .with_label_values(["error", split_type]) - .clone(), + deleted_splits: counter!( + parent: &crate::metrics::GC_DELETED_SPLITS, + labels: &success_labels, + ), + deleted_bytes: counter!( + parent: &crate::metrics::GC_DELETED_BYTES, + labels: &split_type_labels, + ), + failed_splits: counter!( + parent: &crate::metrics::GC_DELETED_SPLITS, + labels: &error_labels, + ), } } @@ -202,18 +208,21 @@ impl GarbageCollector { .await; let tantivy_run_duration = tantivy_start.elapsed().as_secs(); - JANITOR_METRICS - .gc_seconds_total - .with_label_values(["tantivy"]) - .inc_by(tantivy_run_duration); + counter!( + parent: &crate::metrics::GC_SECONDS_TOTAL, + "split_type" => "tantivy", + ) + .increment(tantivy_run_duration); let result = match gc_res { Ok(removal_info) => { self.counters.num_successful_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["success", "tantivy"]) - .inc(); + counter!( + parent: &crate::metrics::GC_RUNS, + "result" => "success", + "split_type" => "tantivy", + ) + .increment(1); GcRunResult { num_deleted_splits: removal_info.removed_split_entries.len(), num_deleted_bytes: removal_info @@ -232,10 +241,12 @@ impl GarbageCollector { } Err(error) => { 
self.counters.num_failed_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["error", "tantivy"]) - .inc(); + counter!( + parent: &crate::metrics::GC_RUNS, + "result" => "error", + "split_type" => "tantivy", + ) + .increment(1); error!(error=?error, "failed to run garbage collection"); GcRunResult::failed() } @@ -258,18 +269,21 @@ impl GarbageCollector { .await; let parquet_run_duration = parquet_start.elapsed().as_secs(); - JANITOR_METRICS - .gc_seconds_total - .with_label_values(["parquet"]) - .inc_by(parquet_run_duration); + counter!( + parent: &crate::metrics::GC_SECONDS_TOTAL, + "split_type" => "parquet", + ) + .increment(parquet_run_duration); let result = match gc_res { Ok(removal_info) => { self.counters.num_successful_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["success", "parquet"]) - .inc(); + counter!( + parent: &crate::metrics::GC_RUNS, + "result" => "success", + "split_type" => "parquet", + ) + .increment(1); GcRunResult { num_deleted_splits: removal_info.removed_split_count(), num_deleted_bytes: removal_info.removed_bytes() as usize, @@ -284,10 +298,12 @@ impl GarbageCollector { } Err(error) => { self.counters.num_failed_gc_run += 1; - JANITOR_METRICS - .gc_runs - .with_label_values(["error", "parquet"]) - .inc(); + counter!( + parent: &crate::metrics::GC_RUNS, + "result" => "error", + "split_type" => "parquet", + ) + .increment(1); error!(error=?error, "failed to run parquet garbage collection"); GcRunResult::failed() } diff --git a/quickwit/quickwit-janitor/src/metrics.rs b/quickwit/quickwit-janitor/src/metrics.rs index aeea26c2674..a6e33562dcb 100644 --- a/quickwit/quickwit-janitor/src/metrics.rs +++ b/quickwit/quickwit-janitor/src/metrics.rs @@ -14,58 +14,50 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntCounterVec, IntGaugeVec, new_counter_vec, new_gauge_vec}; +use quickwit_metrics::{Counter, Gauge, Labels, counter, gauge}; -pub struct JanitorMetrics { - pub ongoing_num_delete_operations_total: 
IntGaugeVec<1>, - pub gc_deleted_splits: IntCounterVec<2>, - pub gc_deleted_bytes: IntCounterVec<1>, - pub gc_runs: IntCounterVec<2>, - pub gc_seconds_total: IntCounterVec<1>, -} +pub(crate) static ONGOING_NUM_DELETE_OPERATIONS_TOTAL: LazyLock = LazyLock::new(|| { + gauge!( + name: "ongoing_num_delete_operations_total", + description: "Num of ongoing delete operations (per index).", + subsystem: "quickwit_janitor", + ) +}); -impl Default for JanitorMetrics { - fn default() -> Self { - JanitorMetrics { - ongoing_num_delete_operations_total: new_gauge_vec( - "ongoing_num_delete_operations_total", - "Num of ongoing delete operations (per index).", - "quickwit_janitor", - &[], - ["index"], - ), - gc_deleted_splits: new_counter_vec( - "gc_deleted_splits_total", - "Total number of splits deleted by the garbage collector.", - "quickwit_janitor", - &[], - ["result", "split_type"], - ), - gc_deleted_bytes: new_counter_vec( - "gc_deleted_bytes_total", - "Total number of bytes deleted by the garbage collector.", - "quickwit_janitor", - &[], - ["split_type"], - ), - gc_runs: new_counter_vec( - "gc_runs_total", - "Total number of garbage collector execition.", - "quickwit_janitor", - &[], - ["result", "split_type"], - ), - gc_seconds_total: new_counter_vec( - "gc_seconds_total", - "Total time spent running the garbage collector", - "quickwit_janitor", - &[], - ["split_type"], - ), - } - } -} +pub(crate) const INDEX_LABELS: Labels<1> = Labels::new(["index"]); -/// `JANITOR_METRICS` exposes a bunch of related metrics through a prometheus -/// endpoint. 
-pub static JANITOR_METRICS: LazyLock = LazyLock::new(JanitorMetrics::default); +pub(crate) static GC_DELETED_SPLITS: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_deleted_splits_total", + description: "Total number of splits deleted by the garbage collector.", + subsystem: "quickwit_janitor", + ) +}); + +pub(crate) const GC_RESULT_SPLIT_TYPE_LABELS: Labels<2> = Labels::new(["result", "split_type"]); + +pub(crate) static GC_DELETED_BYTES: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_deleted_bytes_total", + description: "Total number of bytes deleted by the garbage collector.", + subsystem: "quickwit_janitor", + ) +}); + +pub(crate) const GC_SPLIT_TYPE_LABELS: Labels<1> = Labels::new(["split_type"]); + +pub(crate) static GC_RUNS: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_runs_total", + description: "Total number of garbage collector execition.", + subsystem: "quickwit_janitor", + ) +}); + +pub(crate) static GC_SECONDS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "gc_seconds_total", + description: "Total time spent running the garbage collector", + subsystem: "quickwit_janitor", + ) +}); diff --git a/quickwit/quickwit-lambda-client/Cargo.toml b/quickwit/quickwit-lambda-client/Cargo.toml index 9f8318e7c15..1d33060be2e 100644 --- a/quickwit/quickwit-lambda-client/Cargo.toml +++ b/quickwit/quickwit-lambda-client/Cargo.toml @@ -23,6 +23,7 @@ tokio = { workspace = true } tracing = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-lambda-server = { workspace = true } quickwit-proto = { workspace = true } diff --git a/quickwit/quickwit-lambda-client/src/invoker.rs b/quickwit/quickwit-lambda-client/src/invoker.rs index c8ffa0716a0..6b67431fdc5 100644 --- a/quickwit/quickwit-lambda-client/src/invoker.rs +++ b/quickwit/quickwit-lambda-client/src/invoker.rs @@ -25,12 +25,11 @@ use base64::prelude::*; use prost::Message; use 
quickwit_common::retry::RetryParams; use quickwit_lambda_server::{LambdaSearchRequestPayload, LambdaSearchResponsePayload}; +use quickwit_metrics::{counter, histogram}; use quickwit_proto::search::{LambdaSearchResponses, LambdaSingleSplitResult, LeafSearchRequest}; use quickwit_search::{LambdaLeafSearchInvoker, SearchError}; use tracing::{debug, info, instrument, warn}; -use crate::metrics::LAMBDA_METRICS; - /// Upper bound on the retry-after hint we will honor from Lambda rate-limit responses. const MAX_RETRY_AFTER: Duration = Duration::from_secs(10); @@ -171,14 +170,17 @@ impl LambdaLeafSearchInvoker for AwsLambdaInvoker { let result = self.invoke_leaf_search_with_retry(request).await; let elapsed = start.elapsed().as_secs_f64(); let status = if result.is_ok() { "success" } else { "error" }; - LAMBDA_METRICS - .leaf_search_requests_total - .with_label_values([status]) - .inc(); - LAMBDA_METRICS - .leaf_search_duration_seconds - .with_label_values([status]) - .observe(elapsed); + let labels = crate::metrics::STATUS_LABELS.with_values([status]); + counter!( + parent: &crate::metrics::LEAF_SEARCH_REQUESTS_TOTAL, + labels: &labels, + ) + .increment(1); + histogram!( + parent: &crate::metrics::LEAF_SEARCH_DURATION_SECONDS, + labels: &labels, + ) + .record(elapsed); result } } @@ -232,9 +234,7 @@ impl AwsLambdaInvoker { let payload_json = serde_json::to_vec(&payload) .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?; - LAMBDA_METRICS - .leaf_search_request_payload_size_bytes - .observe(payload_json.len() as f64); + crate::metrics::LEAF_SEARCH_REQUEST_PAYLOAD_SIZE_BYTES.record(payload_json.len() as f64); debug!( payload_size = payload_json.len(), @@ -274,9 +274,8 @@ impl AwsLambdaInvoker { .payload() .ok_or_else(|| SearchError::Internal("no response payload from Lambda".into()))?; - LAMBDA_METRICS - .leaf_search_response_payload_size_bytes - .observe(response_payload.as_ref().len() as f64); + 
crate::metrics::LEAF_SEARCH_RESPONSE_PAYLOAD_SIZE_BYTES + .record(response_payload.as_ref().len() as f64); let lambda_response: LambdaSearchResponsePayload = serde_json::from_slice(response_payload.as_ref()) diff --git a/quickwit/quickwit-lambda-client/src/lib.rs b/quickwit/quickwit-lambda-client/src/lib.rs index aebf264df8c..70163f06e84 100644 --- a/quickwit/quickwit-lambda-client/src/lib.rs +++ b/quickwit/quickwit-lambda-client/src/lib.rs @@ -32,6 +32,5 @@ mod invoker; mod metrics; pub use deploy::try_get_or_deploy_invoker; -pub use metrics::LAMBDA_METRICS; // Re-export payload types from server crate for convenience pub use quickwit_lambda_server::{LambdaSearchRequestPayload, LambdaSearchResponsePayload}; diff --git a/quickwit/quickwit-lambda-client/src/metrics.rs b/quickwit/quickwit-lambda-client/src/metrics.rs index f136e4249c1..dd2ff87f189 100644 --- a/quickwit/quickwit-lambda-client/src/metrics.rs +++ b/quickwit/quickwit-lambda-client/src/metrics.rs @@ -16,10 +16,10 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{ - Histogram, HistogramVec, IntCounterVec, exponential_buckets, new_counter_vec, new_histogram, - new_histogram_vec, -}; +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{Counter, Histogram, Labels, counter, histogram}; + +pub(crate) const STATUS_LABELS: Labels<1> = Labels::new(["status"]); /// From 100ms to 73s seconds fn duration_buckets() -> Vec { @@ -31,45 +31,39 @@ fn payload_size_buckets() -> Vec { exponential_buckets(1024.0, 4.0, 8).unwrap() } -pub struct LambdaMetrics { - pub leaf_search_requests_total: IntCounterVec<1>, - pub leaf_search_duration_seconds: HistogramVec<1>, - pub leaf_search_request_payload_size_bytes: Histogram, - pub leaf_search_response_payload_size_bytes: Histogram, -} +pub(crate) static LEAF_SEARCH_REQUESTS_TOTAL: LazyLock = LazyLock::new(|| { + counter!( + name: "leaf_search_requests_total", + description: "Total number of Lambda leaf search invocations.", + subsystem: "lambda", + ) 
+}); -impl Default for LambdaMetrics { - fn default() -> Self { - LambdaMetrics { - leaf_search_requests_total: new_counter_vec( - "leaf_search_requests_total", - "Total number of Lambda leaf search invocations.", - "lambda", - &[], - ["status"], - ), - leaf_search_duration_seconds: new_histogram_vec( - "leaf_search_duration_seconds", - "Duration of Lambda leaf search invocations in seconds.", - "lambda", - &[], - ["status"], - duration_buckets(), - ), - leaf_search_request_payload_size_bytes: new_histogram( - "leaf_search_request_payload_size_bytes", - "Size of the request payload sent to Lambda in bytes.", - "lambda", - payload_size_buckets(), - ), - leaf_search_response_payload_size_bytes: new_histogram( - "leaf_search_response_payload_size_bytes", - "Size of the response payload received from Lambda in bytes.", - "lambda", - payload_size_buckets(), - ), - } - } -} +pub(crate) static LEAF_SEARCH_DURATION_SECONDS: LazyLock = LazyLock::new(|| { + histogram!( + name: "leaf_search_duration_seconds", + description: "Duration of Lambda leaf search invocations in seconds.", + subsystem: "lambda", + buckets: duration_buckets(), + ) +}); + +pub(crate) static LEAF_SEARCH_REQUEST_PAYLOAD_SIZE_BYTES: LazyLock = + LazyLock::new(|| { + histogram!( + name: "leaf_search_request_payload_size_bytes", + description: "Size of the request payload sent to Lambda in bytes.", + subsystem: "lambda", + buckets: payload_size_buckets(), + ) + }); -pub static LAMBDA_METRICS: LazyLock = LazyLock::new(LambdaMetrics::default); +pub(crate) static LEAF_SEARCH_RESPONSE_PAYLOAD_SIZE_BYTES: LazyLock = + LazyLock::new(|| { + histogram!( + name: "leaf_search_response_payload_size_bytes", + description: "Size of the response payload received from Lambda in bytes.", + subsystem: "lambda", + buckets: payload_size_buckets(), + ) + }); diff --git a/quickwit/quickwit-metastore/Cargo.toml b/quickwit/quickwit-metastore/Cargo.toml index 8a8a4755feb..ac3e8d5406f 100644 --- 
a/quickwit/quickwit-metastore/Cargo.toml +++ b/quickwit/quickwit-metastore/Cargo.toml @@ -40,6 +40,7 @@ uuid = { workspace = true } utoipa = { workspace = true } quickwit-common = { workspace = true } +quickwit-metrics = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-parquet-engine = { workspace = true } diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs index 59cea1db805..b45d005366a 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metrics.rs @@ -14,39 +14,28 @@ use std::sync::LazyLock; -use quickwit_common::metrics::{IntGauge, new_gauge}; +use quickwit_metrics::{Gauge, gauge}; -#[derive(Clone)] -pub(super) struct PostgresMetrics { - pub acquire_connections: IntGauge, - pub active_connections: IntGauge, - pub idle_connections: IntGauge, -} +pub(super) static ACQUIRE_CONNECTIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "acquire_connections", + description: "Number of connections being acquired.", + subsystem: "metastore", + ) +}); -impl Default for PostgresMetrics { - fn default() -> Self { - Self { - acquire_connections: new_gauge( - "acquire_connections", - "Number of connections being acquired.", - "metastore", - &[], - ), - active_connections: new_gauge( - "active_connections", - "Number of active (used + idle) connections.", - "metastore", - &[], - ), - idle_connections: new_gauge( - "idle_connections", - "Number of idle connections.", - "metastore", - &[], - ), - } - } -} +pub(super) static ACTIVE_CONNECTIONS: LazyLock = LazyLock::new(|| { + gauge!( + name: "active_connections", + description: "Number of active (used + idle) connections.", + subsystem: "metastore", + ) +}); -pub(super) static POSTGRES_METRICS: LazyLock = - LazyLock::new(PostgresMetrics::default); +pub(super) static IDLE_CONNECTIONS: LazyLock = 
LazyLock::new(|| { + gauge!( + name: "idle_connections", + description: "Number of idle connections.", + subsystem: "metastore", + ) +}); diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs b/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs index a4c1e790e5b..963df072afa 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/pool.rs @@ -14,15 +14,13 @@ use futures::future::BoxFuture; use futures::stream::BoxStream; -use quickwit_common::metrics::GaugeGuard; +use quickwit_metrics::GaugeGuard; use sqlx::pool::PoolConnection; use sqlx::pool::maybe::MaybePoolConnection; use sqlx::{ Acquire, Database, Describe, Either, Error, Execute, Executor, Pool, Postgres, Transaction, }; -use super::metrics::POSTGRES_METRICS; - #[derive(Debug)] pub(super) struct TrackedPool { inner_pool: Pool, @@ -50,16 +48,12 @@ impl<'a, DB: Database> Acquire<'a> for &TrackedPool { fn acquire(self) -> BoxFuture<'static, Result> { let acquire_conn_fut = self.inner_pool.acquire(); - POSTGRES_METRICS - .active_connections - .set(self.inner_pool.size() as i64); - POSTGRES_METRICS - .idle_connections - .set(self.inner_pool.num_idle() as i64); + super::metrics::ACTIVE_CONNECTIONS.set(self.inner_pool.size() as f64); + super::metrics::IDLE_CONNECTIONS.set(self.inner_pool.num_idle() as f64); Box::pin(async move { - let mut gauge_guard = GaugeGuard::from_gauge(&POSTGRES_METRICS.acquire_connections); - gauge_guard.add(1); + let _gauge_guard = GaugeGuard::from_gauge(&super::metrics::ACQUIRE_CONNECTIONS); + _gauge_guard.increment(1.0); let conn = acquire_conn_fut.await?; Ok(conn) diff --git a/quickwit/quickwit-metrics-inventory/Cargo.toml b/quickwit/quickwit-metrics-inventory/Cargo.toml new file mode 100644 index 00000000000..504eaa8cec7 --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "quickwit-metrics-inventory" +version = { workspace = true } +edition 
= { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +description = "Enumerates all quickwit-metrics declarations across workspace crates" + +[dependencies] +quickwit-metrics = { workspace = true } diff --git a/quickwit/quickwit-metrics-inventory/build.rs b/quickwit/quickwit-metrics-inventory/build.rs new file mode 100644 index 00000000000..9659de6a894 --- /dev/null +++ b/quickwit/quickwit-metrics-inventory/build.rs @@ -0,0 +1,36 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap(); + // Prevent the linker from stripping inventory-submitted statics that + // live in dependency rlibs. Without this, the linker sees that the + // inventory binary never references symbols from dependency crates + // and drops them — along with the MetricInfo entries registered via + // inventory::submit!(). 
+    match target_os.as_str() {
+        "macos" => {
+            println!("cargo::rustc-link-arg-bins=-Wl,-all_load");
+        }
+        "linux" => {
+            println!("cargo::rustc-link-arg-bins=-Wl,--whole-archive");
+        }
+        other => {
+            eprintln!(
+                "cargo:warning=quickwit-metrics-inventory: no whole-archive linker flag for \
+                 target OS '{other}'; inventory discovery from dependency crates may not work"
+            );
+        }
+    }
+}
diff --git a/quickwit/quickwit-metrics-inventory/scripts/run.sh b/quickwit/quickwit-metrics-inventory/scripts/run.sh
new file mode 100755
index 00000000000..253d7d51605
--- /dev/null
+++ b/quickwit/quickwit-metrics-inventory/scripts/run.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+#
+# Discovers quickwit-metrics reverse dependencies, patches Cargo.toml and
+# src/main.rs, builds and runs the inventory binary, then restores both
+# files via git. Files are always restored — even on Ctrl-C or failure.
+#
+# Usage:
+#   ./scripts/run.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+CRATE_DIR="$(dirname "$SCRIPT_DIR")"
+WORKSPACE_DIR="$(dirname "$CRATE_DIR")"
+CARGO_TOML="$CRATE_DIR/Cargo.toml"
+MAIN_RS="$CRATE_DIR/src/main.rs"
+
+trap 'git restore "$CARGO_TOML" "$MAIN_RS"' EXIT
+
+# --format '{lib}' outputs the Rust crate name (underscores, no version/path).
+# --prefix none removes tree decorators. tail skips the root (quickwit-metrics itself).
+REVERSE_DEPS=$(cargo tree --manifest-path "$WORKSPACE_DIR/Cargo.toml" \
+  --workspace --all-features --depth 1 --invert quickwit-metrics \
+  --prefix none --format '{lib}' 2>/dev/null \
+  | tail -n +2)
+
+for rust_name in $REVERSE_DEPS; do
+  pkg_name=$(echo "$rust_name" | tr '_' '-')
+  echo "$pkg_name = { workspace = true }" >> "$CARGO_TOML"
+  echo "extern crate $rust_name;" >> "$MAIN_RS"
+done
+
+cargo run --manifest-path "$CARGO_TOML"
diff --git a/quickwit/quickwit-metrics-inventory/src/main.rs b/quickwit/quickwit-metrics-inventory/src/main.rs
new file mode 100644
index 00000000000..75c8edfd5f5
--- /dev/null
+++ b/quickwit/quickwit-metrics-inventory/src/main.rs
@@ -0,0 +1,72 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Enumerates all registered `MetricInfo` entries via `inventory`.
+//!
+//! **Do not run this binary directly** — it will only see metrics from
+//! crates listed in this crate's `Cargo.toml` dependencies. To discover
+//! metrics from all workspace crates, use the wrapper script which patches
+//! in reverse dependencies:
+//!
+//! ```sh
+//! ./scripts/run.sh
+//! ```
+//!
+//! The script temporarily adds `extern crate` lines and `Cargo.toml`
+//! dependencies for every crate that depends on `quickwit-metrics`, then
+//! restores the files on exit. The `build.rs` ensures the linker pulls in
+//! all inventory submissions even without explicit symbol references.
+ +use std::collections::BTreeMap; + +fn format_key(info: &quickwit_metrics::MetricInfo) -> String { + if info.static_labels.is_empty() { + info.key_name.to_string() + } else { + let pairs: Vec = info + .static_labels + .iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + format!("{}{{{}}}", info.key_name, pairs.join(", ")) + } +} + +fn main() { + let mut by_module: BTreeMap<&str, BTreeMap> = + BTreeMap::new(); + + for info in quickwit_metrics::metrics_info() { + let module = info.metadata.module_path().unwrap_or(""); + by_module + .entry(module) + .or_default() + .insert(format_key(info), info); + } + + for (module, metrics) in &by_module { + let max_key_len = metrics.keys().map(|k| k.len()).max().unwrap_or(0); + println!("{module}"); + for (key, info) in metrics { + println!( + " {key:, + _desc: metrics::SharedString, + ) { + } + fn describe_gauge( + &self, + _key: metrics::KeyName, + _unit: Option, + _desc: metrics::SharedString, + ) { + } + fn describe_histogram( + &self, + _key: metrics::KeyName, + _unit: Option, + _desc: metrics::SharedString, + ) { + } + fn register_counter(&self, _key: &metrics::Key, _metadata: &metrics::Metadata<'_>) -> Counter { + Counter::noop() + } + fn register_gauge(&self, _key: &metrics::Key, _metadata: &metrics::Metadata<'_>) -> Gauge { + Gauge::noop() + } + fn register_histogram( + &self, + _key: &metrics::Key, + _metadata: &metrics::Metadata<'_>, + ) -> Histogram { + Histogram::noop() + } +} + +// --------------------------------------------------------------------------- +// Recorder setup — RECORDER env-var is mandatory. 
+// +// RECORDER=noop cargo bench --bench baseline # noop recorder +// RECORDER=prometheus cargo bench --bench baseline # prometheus +// --------------------------------------------------------------------------- + +static INSTALL_RECORDER: OnceLock<()> = OnceLock::new(); + +fn install_recorder() { + INSTALL_RECORDER.get_or_init(|| { + let recorder = std::env::var("RECORDER") + .expect("RECORDER env var is required (set to \"noop\" or \"prometheus\")"); + + match recorder.to_ascii_lowercase().as_str() { + "noop" => { + eprintln!("[bench] Using noop recorder"); + metrics::set_global_recorder(NoopRecorder) + .expect("failed to install noop recorder"); + } + "prometheus" => { + eprintln!("[bench] Using prometheus recorder"); + let _handle = metrics_exporter_prometheus::PrometheusBuilder::new() + .install_recorder() + .expect("failed to install prometheus recorder"); + } + other => { + panic!("unknown RECORDER value \"{other}\", expected \"noop\" or \"prometheus\"") + } + } + }); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn make_labels(n: usize) -> Vec