From 1063baac3bf9f3b0b28c2ed4543adf8281031111 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:16:52 +0300 Subject: [PATCH 1/6] Merge pull request #1414 from Altinity/frontport/antalya-26.1/rendezvous_hashing 26.1 Antalya port - improvements for cluster requests --- docs/en/sql-reference/statements/system.md | 6 + programs/server/Server.cpp | 4 + src/Access/Common/AccessType.h | 1 + src/Client/MultiplexedConnections.cpp | 7 +- src/Common/CurrentMetrics.cpp | 1 + src/Common/ProfileEvents.cpp | 5 + src/Core/Protocol.h | 2 + src/Core/Range.cpp | 41 +++ src/Core/Range.h | 5 + src/Core/Settings.cpp | 19 +- src/Core/SettingsChangesHistory.cpp | 24 ++ .../ObjectStorages/IObjectStorage.cpp | 56 ++++ .../ObjectStorages/IObjectStorage.h | 66 ++++- src/Interpreters/ClusterDiscovery.cpp | 62 ++++- src/Interpreters/ClusterDiscovery.h | 13 + src/Interpreters/ClusterFunctionReadTask.cpp | 14 +- src/Interpreters/ClusterFunctionReadTask.h | 2 + src/Interpreters/Context.cpp | 45 ++- src/Interpreters/Context.h | 11 + src/Interpreters/InterpreterSystemQuery.cpp | 20 ++ src/Parsers/ASTSystemQuery.cpp | 2 + src/Parsers/ASTSystemQuery.h | 2 + src/Processors/Chunk.cpp | 7 +- src/Processors/Sources/ConstChunkGenerator.h | 9 +- src/QueryPipeline/RemoteQueryExecutor.cpp | 22 ++ src/QueryPipeline/RemoteQueryExecutor.h | 21 +- .../RemoteQueryExecutorReadContext.cpp | 44 ++- .../RemoteQueryExecutorReadContext.h | 5 +- .../DataLakes/IDataLakeMetadata.cpp | 191 +++++++++++++ .../DataLakes/IDataLakeMetadata.h | 63 ++++- .../DataLakes/Iceberg/IcebergIterator.cpp | 5 + .../DataLakes/Iceberg/IcebergIterator.h | 3 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 4 +- .../DataLakes/Iceberg/ManifestFile.h | 24 +- .../StorageObjectStorageCluster.cpp | 68 ++++- .../StorageObjectStorageSource.cpp | 207 +++++++++++++- .../StorageObjectStorageSource.h | 14 +- ...rageObjectStorageStableTaskDistributor.cpp | 166 +++++++++-- ...torageObjectStorageStableTaskDistributor.h | 22 +- .../tests/gtest_rendezvous_hashing.cpp | 18 +- src/Storages/StorageFileCluster.cpp | 45 ++- src/Storages/StorageURLCluster.cpp | 40 ++- .../test_s3_cache_locality/__init__.py | 0 .../configs/cluster.xml | 126 +++++++++ .../configs/named_collections.xml | 10 + .../test_s3_cache_locality/configs/users.xml | 9 + .../test_s3_cache_locality/test.py | 262 ++++++++++++++++++ .../test_s3_cluster/data/graceful/part0.csv | 1 + .../test_s3_cluster/data/graceful/part1.csv | 1 + .../test_s3_cluster/data/graceful/part2.csv | 1 + .../test_s3_cluster/data/graceful/part3.csv | 1 + .../test_s3_cluster/data/graceful/part4.csv | 1 + .../test_s3_cluster/data/graceful/part5.csv | 1 + .../test_s3_cluster/data/graceful/part6.csv | 1 + .../test_s3_cluster/data/graceful/part7.csv | 1 + .../test_s3_cluster/data/graceful/part8.csv | 1 + .../test_s3_cluster/data/graceful/part9.csv | 1 + .../test_s3_cluster/data/graceful/partA.csv | 1 + .../test_s3_cluster/data/graceful/partB.csv | 1 + .../test_s3_cluster/data/graceful/partC.csv | 1 + .../test_s3_cluster/data/graceful/partD.csv | 1 + .../test_s3_cluster/data/graceful/partE.csv | 1 + .../test_s3_cluster/data/graceful/partF.csv | 1 + tests/integration/test_s3_cluster/test.py | 81 ++++++ ...test_read_constant_columns_optimization.py | 215 ++++++++++++++ ...ck_object_storage_task_distribution_ms.xml | 7 + tests/integration/test_storage_s3/test.py | 2 + .../01271_show_privileges.reference | 1 + ..._settings_cannot_be_enabled_by_default.sql | 6 +- 69 files changed, 2016 insertions(+), 
105 deletions(-) create mode 100644 tests/integration/test_s3_cache_locality/__init__.py create mode 100644 tests/integration/test_s3_cache_locality/configs/cluster.xml create mode 100644 tests/integration/test_s3_cache_locality/configs/named_collections.xml create mode 100644 tests/integration/test_s3_cache_locality/configs/users.xml create mode 100644 tests/integration/test_s3_cache_locality/test.py create mode 100644 tests/integration/test_s3_cluster/data/graceful/part0.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part1.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part2.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part3.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part4.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part5.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part6.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part7.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part8.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part9.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partA.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partB.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partC.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partD.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partE.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partF.csv create mode 100644 tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py create mode 100644 tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 65521ba942f0..933bd4d32aca 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -224,6 +224,12 @@ Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$ Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) +## SYSTEM PRESHUTDOWN {#preshutdown} + + + +Prepares the node for graceful shutdown: unregisters it from autodiscovered clusters and stops accepting distributed requests to object storages (s3Cluster, icebergCluster, etc.). + ## SYSTEM INSTRUMENT {#instrument} Manages instrumentation points using LLVM's XRay feature which is available when ClickHouse is built using `ENABLE_XRAY=1`. diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 4e4ba58226cd..7b45a62f4834 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2607,6 +2607,8 @@ try } + global_context->startSwarmMode(); + { std::lock_guard lock(servers_lock); /// We should start interserver communications before (and more important shutdown after) tables.
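As a rough illustration of the mechanism this patch introduces (the class and member names below are hypothetical, not part of the patch), the swarm-mode flag behaves like an idempotent atomic toggle: Context::startSwarmMode() and stopSwarmMode(), added further down in this diff, return true only when the state actually changes, which is what lets SYSTEM START/STOP SWARM MODE register or unregister the node in autodiscovered clusters exactly once, no matter how many times the statement is repeated.

#include <atomic>

/// Hypothetical sketch of the idempotent toggle pattern used by the patch.
class SwarmModeFlag
{
public:
    /// Returns true only for the call that actually flips the flag off.
    bool stop()
    {
        bool expected = true;
        return enabled.compare_exchange_strong(expected, false);
    }

    /// Returns true only for the call that actually flips the flag back on.
    bool start()
    {
        bool expected = false;
        return enabled.compare_exchange_strong(expected, true);
    }

    bool isEnabled() const { return enabled.load(); }

private:
    std::atomic<bool> enabled{true};  /// Enabled by default, as in the patch.
};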
@@ -3094,6 +3096,8 @@ try is_cancelled = true; + global_context->stopSwarmMode(); + LOG_DEBUG(log, "Waiting for current connections to close."); size_t current_connections = 0; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 3ae01fb44fce..4ffbfebfbd47 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -352,6 +352,7 @@ enum class AccessType : uint8_t M(SYSTEM_TTL_MERGES, "SYSTEM STOP TTL MERGES, SYSTEM START TTL MERGES, STOP TTL MERGES, START TTL MERGES", TABLE, SYSTEM) \ M(SYSTEM_FETCHES, "SYSTEM STOP FETCHES, SYSTEM START FETCHES, STOP FETCHES, START FETCHES", TABLE, SYSTEM) \ M(SYSTEM_MOVES, "SYSTEM STOP MOVES, SYSTEM START MOVES, STOP MOVES, START MOVES", TABLE, SYSTEM) \ + M(SYSTEM_SWARM, "SYSTEM STOP SWARM MODE, SYSTEM START SWARM MODE, STOP SWARM MODE, START SWARM MODE", GLOBAL, SYSTEM) \ M(SYSTEM_PULLING_REPLICATION_LOG, "SYSTEM STOP PULLING REPLICATION LOG, SYSTEM START PULLING REPLICATION LOG", TABLE, SYSTEM) \ M(SYSTEM_CLEANUP, "SYSTEM STOP CLEANUP, SYSTEM START CLEANUP", TABLE, SYSTEM) \ M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, CANCEL VIEW", VIEW, SYSTEM) \ diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index c4c5010df342..3b1a09631dd9 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -232,7 +232,7 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector & uuid void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFunctionReadTaskResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendClusterFunctionReadTaskResponse(response); } @@ -241,7 +241,7 @@ void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFu void MultiplexedConnections::sendMergeTreeReadTaskResponse(const ParallelReadResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendMergeTreeReadTaskResponse(response); } @@ -527,9 +527,12 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead void MultiplexedConnections::invalidateReplica(ReplicaState & state) { + Connection * old_connection = state.connection; state.connection = nullptr; state.pool_entry = IConnectionPool::Entry(); --active_connection_count; + if (current_connection == old_connection) + current_connection = nullptr; } void MultiplexedConnections::setAsyncCallback(AsyncCallback async_callback) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 84d3b3c5bd0e..f398128cfebf 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -483,6 +483,7 @@ M(StartupScriptsExecutionState, "State of startup scripts execution: 0 = not finished, 1 = success, 2 = failure.") \ \ M(IsServerShuttingDown, "Indicates if the server is shutting down: 0 = no, 1 = yes") \ + M(IsSwarmModeEnabled, "Indicates if the swarm mode enabled or not: 0 = disabled, 1 = enabled") \ \ M(StatelessWorkerThreads, "Number of threads in the stateless worker thread pool.") \ M(StatelessWorkerThreadsActive, "Number of threads in the stateless worker thread pool running a task.") \ diff --git 
a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 9872e039abfa..87368fd76272 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -1377,6 +1377,11 @@ The server successfully detected this situation and will download merged part fr M(RuntimeFilterRowsChecked, "Number of rows checked by JOIN Runtime Filters", ValueType::Number) \ M(RuntimeFilterRowsPassed, "Number of rows that passed (not filtered out by) JOIN Runtime Filters", ValueType::Number) \ M(RuntimeFilterRowsSkipped, "Number of rows in blocks that were skipped by JOIN Runtime Filters", ValueType::Number) \ + \ + M(ObjectStorageClusterSentToMatchedReplica, "Number of tasks in ObjectStorageCluster request sent to matched replica.", ValueType::Number) \ + M(ObjectStorageClusterSentToNonMatchedReplica, "Number of tasks in ObjectStorageCluster request sent to non-matched replica.", ValueType::Number) \ + M(ObjectStorageClusterProcessedTasks, "Number of processed tasks in ObjectStorageCluster request.", ValueType::Number) \ + M(ObjectStorageClusterWaitingMicroseconds, "Time of waiting for tasks in ObjectStorageCluster request.", ValueType::Microseconds) \ #ifdef APPLY_FOR_EXTERNAL_EVENTS diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index d8bc5be252e9..dcb83cd33a80 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -96,8 +96,10 @@ namespace Protocol MergeTreeReadTaskRequest = 16, /// Request from a MergeTree replica to a coordinator TimezoneUpdate = 17, /// Receive server's (session-wide) default timezone SSHChallenge = 18, /// Return challenge for SSH signature signing + MAX = SSHChallenge, + ConnectionLost = 255, /// Exception that occurred on the client side. }; /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10 diff --git a/src/Core/Range.cpp b/src/Core/Range.cpp index 139fb8db76c9..766c2d4719c8 100644 --- a/src/Core/Range.cpp +++ b/src/Core/Range.cpp @@ -2,13 +2,21 @@ #include #include #include +#include #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +}; + + FieldRef::FieldRef(ColumnsWithTypeAndName * columns_, size_t row_idx_, size_t column_idx_) : Field((*(*columns_)[column_idx_].column)[row_idx_]), columns(columns_), row_idx(row_idx_), column_idx(column_idx_) { @@ -151,6 +159,13 @@ bool Range::isInfinite() const return left.isNegativeInfinity() && right.isPositiveInfinity(); } +/// [x, x] +bool Range::isPoint() const +{ + return fullBounded() && left_included && right_included && equals(left, right) + && !left.isNegativeInfinity() && !left.isPositiveInfinity(); +} + bool Range::intersectsRange(const Range & r) const { /// r to the left of me. @@ -276,6 +291,32 @@ bool Range::nearByWith(const Range & r) const return false; } +String Range::serialize(bool base64) const +{ + WriteBufferFromOwnString str; + + str << left_included << right_included; + writeFieldBinary(left, str); + writeFieldBinary(right, str); + + if (base64) + return base64Encode(str.str()); + else + return str.str(); +} + +void Range::deserialize(const String & range, bool base64) +{ + if (range.empty()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Empty range dump"); + + ReadBufferFromOwnString str(base64 ? 
base64Decode(range) : range); + + str >> left_included >> right_included; + left = readFieldBinary(str); + right = readFieldBinary(str); +} + Range intersect(const Range & a, const Range & b) { Range res = Range::createWholeUniverse(); diff --git a/src/Core/Range.h b/src/Core/Range.h index 6072795db0a9..059b7470d1d7 100644 --- a/src/Core/Range.h +++ b/src/Core/Range.h @@ -94,6 +94,8 @@ struct Range bool isBlank() const; + bool isPoint() const; + bool intersectsRange(const Range & r) const; bool containsRange(const Range & r) const; @@ -114,6 +116,9 @@ struct Range bool nearByWith(const Range & r) const; String toString() const; + + String serialize(bool base64 = false) const; + void deserialize(const String & range, bool base64 = false); }; Range intersect(const Range & a, const Range & b); diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 0cbb8921adc8..f239be8a09c0 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7665,6 +7665,19 @@ Default number of tasks for parallel reading in distributed query. Tasks are spr DECLARE(Bool, distributed_plan_optimize_exchanges, true, R"( Removes unnecessary exchanges in distributed query plan. Disable it for debugging. )", 0) \ + DECLARE(UInt64, lock_object_storage_task_distribution_ms, 500, R"( +In distributed object storage queries, do not distribute tasks to non-prefetched nodes while the prefetched node is active. +Determines how long a free executor node (one that has finished processing all of its assigned tasks) should wait before "stealing" tasks from the queues of currently busy executor nodes. + +Possible values: + +- 0 - steal tasks immediately after freeing up. +- >0 - wait for the specified period of time before stealing tasks. + +Setting this `>0` helps with cache reuse and might improve overall query time: +the busy node might have warmed-up caches for this specific task, while the free node would need to fetch lots of data from S3, +which might take longer than just waiting for the busy node and would generate extra traffic. +)", EXPERIMENTAL) \ DECLARE(String, distributed_plan_force_exchange_kind, "", R"( Force specified kind of Exchange operators between distributed query stages. @@ -7712,6 +7725,9 @@ If the number of set bits in a runtime bloom filter exceeds this ratio the filte )", EXPERIMENTAL) \ DECLARE(Bool, rewrite_in_to_join, false, R"( Rewrite expressions like 'x IN subquery' to JOIN. This might be useful for optimizing the whole query with join reordering. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_iceberg_read_optimization, true, R"( +Allow Iceberg read optimization based on Iceberg metadata. )", EXPERIMENTAL) \ \ /** Experimental timeSeries* aggregate functions. */ \ @@ -7880,7 +7896,8 @@ Maximum number of WebAssembly UDF instances that can run in parallel per functio MAKE_OBSOLETE(M, Bool, describe_extend_object_types, false) \ MAKE_OBSOLETE(M, Bool, allow_experimental_object_type, false) \ MAKE_OBSOLETE(M, BoolAuto, insert_select_deduplicate, Field{"auto"}) \ - MAKE_OBSOLETE(M, Bool, use_text_index_dictionary_cache, false) + MAKE_OBSOLETE(M, Bool, use_text_index_dictionary_cache, false) \ + MAKE_OBSOLETE(M, Bool, allow_retries_in_cluster_requests, false) /** The section above is for obsolete settings. Do not add anything there.
*/ #endif /// __CLION_IDE__ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index ac6d42e957f6..776393441e03 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -329,6 +329,30 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, // {"object_storage_cluster", "", "", "Antalya: New setting"}, // {"object_storage_max_nodes", 0, 0, "Antalya: New setting"}, + {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, + {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, + {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, + {"allow_retries_in_cluster_requests", false, false, "New setting"}, + // {"object_storage_remote_initiator", false, false, "New setting."}, + {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, + {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, + {"export_merge_tree_partition_force_export", false, false, "New setting."}, + {"export_merge_tree_partition_max_retries", 3, 3, "New setting."}, + {"export_merge_tree_partition_manifest_ttl", 180, 180, "New setting."}, + {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, + // {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, + {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. Default enabled."}, + {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, + {"enable_alias_marker", true, true, "New setting."}, + // {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"}, + // {"input_format_parquet_verify_checksums", true, true, "New setting."}, + // {"output_format_parquet_write_checksums", false, true, "New setting."}, + {"export_merge_tree_part_max_bytes_per_file", 0, 0, "New setting."}, + {"export_merge_tree_part_max_rows_per_file", 0, 0, "New setting."}, + // {"cluster_table_function_split_granularity", "file", "file", "New setting."}, + // {"cluster_table_function_buckets_batch_size", 0, 0, "New setting."}, + {"export_merge_tree_part_throw_on_pending_mutations", true, true, "New setting."}, + {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.8", { diff --git a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp index 5b2225775247..6c825fd9c21b 100644 --- a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp @@ -8,6 +8,11 @@ #include #include #include +#include + +#include +#include +#include namespace DB @@ -102,4 +107,55 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return write_settings; } +RelativePathWithMetadata::RelativePathWithMetadata(const DataFileInfo & info, std::optional metadata_) + : metadata(std::move(metadata_)) +{ + relative_path = info.file_path; + file_meta_info = info.file_meta_info; +} + +RelativePathWithMetadata::CommandInTaskResponse::CommandInTaskResponse(const std::string & task) +{ + Poco::JSON::Parser parser; + try + { + auto json = parser.parse(task).extract(); + if (!json) + return; + + is_valid = true; + + if 
(json->has("file_path")) + file_path = json->getValue("file_path"); + if (json->has("retry_after_us")) + retry_after_us = json->getValue("retry_after_us"); + if (json->has("meta_info")) + file_meta_info = std::make_shared(json->getObject("meta_info")); + } + catch (const Poco::JSON::JSONException &) + { /// Not a JSON + return; + } + catch (const Poco::SyntaxException &) + { /// Not a JSON + return; + } +} + +std::string RelativePathWithMetadata::CommandInTaskResponse::toString() const +{ + Poco::JSON::Object json; + if (file_path.has_value()) + json.set("file_path", file_path.value()); + if (retry_after_us.has_value()) + json.set("retry_after_us", retry_after_us.value()); + if (file_meta_info.has_value()) + json.set("meta_info", file_meta_info.value()->toJson()); + + std::ostringstream oss; + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); +} + } diff --git a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h index 6610ee6874f9..a6a7bc6696f6 100644 --- a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h +++ b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h @@ -110,20 +110,74 @@ struct ObjectMetadata ObjectAttributes attributes; }; + +struct DataFileInfo; +class DataFileMetaInfo; +using DataFileMetaInfoPtr = std::shared_ptr; + struct DataLakeObjectMetadata; struct RelativePathWithMetadata { + class CommandInTaskResponse + { + public: + CommandInTaskResponse() = default; + explicit CommandInTaskResponse(const std::string & task); + + bool isValid() const { return is_valid; } + void setFilePath(const std::string & file_path_ ) + { + file_path = file_path_; + is_valid = true; + } + void setRetryAfterUs(Poco::Timestamp::TimeDiff time_us) + { + retry_after_us = time_us; + is_valid = true; + } + void setFileMetaInfo(DataFileMetaInfoPtr file_meta_info_ ) + { + file_meta_info = file_meta_info_; + is_valid = true; + } + + std::string toString() const; + + std::optional getFilePath() const { return file_path; } + std::optional getRetryAfterUs() const { return retry_after_us; } + std::optional getFileMetaInfo() const { return file_meta_info; } + + private: + bool is_valid = false; + std::optional file_path; + std::optional retry_after_us; + std::optional file_meta_info; + }; + String relative_path; /// Object metadata: size, modification time, etc. 
std::optional metadata; + /// Information about columns + std::optional file_meta_info; + /// Retry request after short pause + CommandInTaskResponse command; RelativePathWithMetadata() = default; - explicit RelativePathWithMetadata(String relative_path_, std::optional metadata_ = std::nullopt) - : relative_path(std::move(relative_path_)) + explicit RelativePathWithMetadata(String command_or_path, std::optional metadata_ = std::nullopt) + : relative_path(std::move(command_or_path)) , metadata(std::move(metadata_)) - {} + , command(relative_path) + { + if (command.isValid()) + { + relative_path = command.getFilePath().value_or(""); + file_meta_info = command.getFileMetaInfo(); + } + } + + explicit RelativePathWithMetadata(const DataFileInfo & info, std::optional metadata_ = std::nullopt); RelativePathWithMetadata(const RelativePathWithMetadata & other) = default; @@ -131,6 +185,12 @@ struct RelativePathWithMetadata std::string getFileName() const { return std::filesystem::path(relative_path).filename(); } std::string getPath() const { return relative_path; } + + void setFileMetaInfo(std::optional file_meta_info_ ) { file_meta_info = file_meta_info_; } + std::optional getFileMetaInfo() const { return file_meta_info; } + + const CommandInTaskResponse & getCommand() const { return command; } + std::string getFileNameWithoutExtension() const { return std::filesystem::path(relative_path).stem(); } }; struct ObjectKeyWithMetadata diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 98f57c31e9e5..844a60a4fba2 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -152,6 +152,13 @@ class ClusterDiscovery::Flags cv.notify_one(); } + void wakeup() + { + std::unique_lock lk(mu); + any_need_update = true; + cv.notify_one(); + } + private: std::condition_variable cv; std::mutex mu; @@ -441,7 +448,9 @@ bool ClusterDiscovery::upsertCluster(ClusterInfo & cluster_info) return true; }; - if (!cluster_info.current_node_is_observer && !contains(node_uuids, current_node_name)) + if (!cluster_info.current_node_is_observer + && context->isSwarmModeEnabled() + && !contains(node_uuids, current_node_name)) { LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name); registerInZk(zk, cluster_info); @@ -506,12 +515,30 @@ void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & inf return; } + if (!context->isSwarmModeEnabled()) + { + LOG_DEBUG(log, "STOP SWARM MODE called, skip self-registering current node {} in cluster {}", current_node_name, info.name); + return; + } + LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name); zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral); LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name); } +void ClusterDiscovery::unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info) +{ + if (info.current_node_is_observer) + return; + + String node_path = getShardsListPath(info.zk_root) / current_node_name; + LOG_DEBUG(log, "Removing current node {} from cluster {}", current_node_name, info.name); + + zk->remove(node_path); + LOG_DEBUG(log, "Current node {} removed from cluster {}", current_node_name, info.name); +} + void ClusterDiscovery::initialUpdate() { LOG_DEBUG(log, "Initializing"); @@ -554,6 +581,18 @@ void ClusterDiscovery::initialUpdate() is_initialized = true; } +void ClusterDiscovery::registerAll() +{ + 
register_change_flag = RegisterChangeFlag::RCF_REGISTER_ALL; + clusters_to_update->wakeup(); +} + +void ClusterDiscovery::unregisterAll() +{ + register_change_flag = RegisterChangeFlag::RCF_UNREGISTER_ALL; + clusters_to_update->wakeup(); +} + void ClusterDiscovery::findDynamicClusters( std::unordered_map & info, std::unordered_set * unchanged_roots) @@ -782,6 +821,27 @@ bool ClusterDiscovery::runMainThread(std::function up_to_date_callback) { up_to_date_callback(); } + + RegisterChangeFlag flag = register_change_flag.exchange(RegisterChangeFlag::RCF_NONE); + + if (flag == RegisterChangeFlag::RCF_REGISTER_ALL) + { + LOG_DEBUG(log, "Register in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + registerInZk(zk, info); + } + } + else if (flag == RegisterChangeFlag::RCF_UNREGISTER_ALL) + { + LOG_DEBUG(log, "Unregister in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + unregisterFromZk(zk, info); + } + } } LOG_DEBUG(log, "Worker thread stopped"); return finished; diff --git a/src/Interpreters/ClusterDiscovery.h b/src/Interpreters/ClusterDiscovery.h index 963970b32af2..eee0c157e465 100644 --- a/src/Interpreters/ClusterDiscovery.h +++ b/src/Interpreters/ClusterDiscovery.h @@ -38,6 +38,9 @@ class ClusterDiscovery ~ClusterDiscovery(); + void registerAll(); + void unregisterAll(); + private: struct NodeInfo { @@ -112,6 +115,7 @@ class ClusterDiscovery void initialUpdate(); void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); + void unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); Strings getNodeNames(zkutil::ZooKeeperPtr & zk, const String & zk_root, @@ -194,6 +198,15 @@ class ClusterDiscovery std::vector multicluster_discovery_paths; MultiVersion::Version macros; + + enum RegisterChangeFlag + { + RCF_NONE, + RCF_REGISTER_ALL, + RCF_UNREGISTER_ALL, + }; + + std::atomic register_change_flag = RegisterChangeFlag::RCF_NONE; }; } diff --git a/src/Interpreters/ClusterFunctionReadTask.cpp b/src/Interpreters/ClusterFunctionReadTask.cpp index 6eda3481a78f..fdacbda9270b 100644 --- a/src/Interpreters/ClusterFunctionReadTask.cpp +++ b/src/Interpreters/ClusterFunctionReadTask.cpp @@ -41,8 +41,16 @@ ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(ObjectInfoPtr o } #endif - const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; - path = send_over_whole_archive ? object->getPathOrPathToArchiveIfArchive() : object->getPath(); + file_meta_info = object->relative_path_with_metadata.file_meta_info; + + if (object->relative_path_with_metadata.getCommand().isValid()) + path = object->relative_path_with_metadata.getCommand().toString(); + else + { + const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; + path = send_over_whole_archive ? 
object->getPathOrPathToArchiveIfArchive() : object->getPath(); + } + file_bucket_info = object->file_bucket_info; } @@ -74,6 +82,8 @@ ObjectInfoPtr ClusterFunctionReadTaskResponse::getObjectInfo() const } object->data_lake_metadata = data_lake_metadata; object->file_bucket_info = file_bucket_info; + if (file_meta_info.has_value()) + object->relative_path_with_metadata.file_meta_info = file_meta_info; return object; } diff --git a/src/Interpreters/ClusterFunctionReadTask.h b/src/Interpreters/ClusterFunctionReadTask.h index d94133d252b5..c41fa8293e75 100644 --- a/src/Interpreters/ClusterFunctionReadTask.h +++ b/src/Interpreters/ClusterFunctionReadTask.h @@ -25,6 +25,8 @@ struct ClusterFunctionReadTaskResponse DataLakeObjectMetadata data_lake_metadata; /// Iceberg object metadata std::optional iceberg_info; + /// File's columns info + std::optional file_meta_info; /// Convert received response into ObjectInfo. ObjectInfoPtr getObjectInfo() const; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 92ae95d7e9c9..2304133f54f8 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -262,6 +262,7 @@ namespace CurrentMetrics extern const Metric IndexUncompressedCacheCells; extern const Metric ZooKeeperSessionExpired; extern const Metric ZooKeeperConnectionLossStartedTimestampSeconds; + extern const Metric IsSwarmModeEnabled; } @@ -696,6 +697,7 @@ struct ContextSharedPart : boost::noncopyable std::map server_ports; std::atomic shutdown_called = false; + std::atomic swarm_mode_enabled = true; Stopwatch uptime_watch TSA_GUARDED_BY(mutex); @@ -867,6 +869,8 @@ struct ContextSharedPart : boost::noncopyable */ void shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { + swarm_mode_enabled = false; + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); bool is_shutdown_called = shutdown_called.exchange(true); if (is_shutdown_called) return; @@ -5565,7 +5569,6 @@ std::shared_ptr Context::getCluster(const std::string & cluster_name) c throw Exception(ErrorCodes::CLUSTER_DOESNT_EXIST, "Requested cluster '{}' not found", cluster_name); } - std::shared_ptr Context::tryGetCluster(const std::string & cluster_name) const { std::shared_ptr res = nullptr; @@ -5584,6 +5587,21 @@ std::shared_ptr Context::tryGetCluster(const std::string & cluster_name return res; } +void Context::unregisterInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->unregisterAll(); +} + +void Context::registerInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->registerAll(); +} void Context::reloadClusterConfig() const { @@ -6563,12 +6581,35 @@ void Context::stopServers(const ServerType & server_type) const shared->stop_servers_callback(server_type); } - void Context::shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { shared->shutdown(); } +bool Context::stopSwarmMode() +{ + bool expected_is_enabled = true; + bool is_stopped_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, false); + if (is_stopped_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); + // return true if stop successful + return is_stopped_now; +} + +bool Context::startSwarmMode() +{ + bool expected_is_enabled = false; + bool is_started_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, true); + if (is_started_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 1); + 
// return true if start successful + return is_started_now; +} + +bool Context::isSwarmModeEnabled() const +{ + return shared->swarm_mode_enabled; +} Context::ApplicationType Context::getApplicationType() const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index b10d3a3f2ce4..74730f22107e 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -1476,6 +1476,8 @@ class Context: public ContextData, public std::enable_shared_from_this size_t getClustersVersion() const; void startClusterDiscovery(); + void registerInAutodiscoveryClusters(); + void unregisterInAutodiscoveryClusters(); /// Sets custom cluster, but doesn't update configuration void setCluster(const String & cluster_name, const std::shared_ptr & cluster); @@ -1595,6 +1597,15 @@ class Context: public ContextData, public std::enable_shared_from_this void shutdown(); + /// Stop some works to allow graceful shutdown later. + /// Returns true if stop successful. + bool stopSwarmMode(); + /// Resume some works if we change our mind. + /// Returns true if start successful. + bool startSwarmMode(); + /// Return current swarm mode state. + bool isSwarmModeEnabled() const; + bool isInternalQuery() const { return is_internal_query; } void setInternalQuery(bool internal) { is_internal_query = internal; } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 8602cf1c36be..c5058e1879f8 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -785,6 +785,20 @@ BlockIO InterpreterSystemQuery::execute() case Type::START_MOVES: startStopAction(ActionLocks::PartsMove, true); break; + case Type::STOP_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->stopSwarmMode()) + getContext()->unregisterInAutodiscoveryClusters(); + break; + } + case Type::START_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->startSwarmMode()) + getContext()->registerInAutodiscoveryClusters(); + break; + } case Type::STOP_FETCHES: startStopAction(ActionLocks::PartsFetch, false); break; @@ -2255,6 +2269,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() required_access.emplace_back(AccessType::SYSTEM_MOVES, query.getDatabase(), query.getTable()); break; } + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: + { + required_access.emplace_back(AccessType::SYSTEM_SWARM); + break; + } case Type::STOP_PULLING_REPLICATION_LOG: case Type::START_PULLING_REPLICATION_LOG: { diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 5fd99ca62aa5..49c6611b7b5e 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -627,6 +627,8 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::RECONNECT_ZOOKEEPER: case Type::FREE_MEMORY: case Type::RESET_DDL_WORKER: + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: break; case Type::UNKNOWN: case Type::END: diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 8a062cbf1646..3d4b087207d1 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -91,6 +91,8 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster START_FETCHES, STOP_MOVES, START_MOVES, + STOP_SWARM_MODE, + START_SWARM_MODE, STOP_REPLICATED_SENDS, START_REPLICATED_SENDS, STOP_REPLICATION_QUEUES, diff --git a/src/Processors/Chunk.cpp 
b/src/Processors/Chunk.cpp index 0564131c8768..17a095db7780 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -108,7 +108,12 @@ void Chunk::addColumn(ColumnPtr column) void Chunk::addColumn(size_t position, ColumnPtr column) { - if (position >= columns.size()) + if (position == columns.size()) + { + addColumn(column); + return; + } + if (position > columns.size()) throw Exception(ErrorCodes::POSITION_OUT_OF_BOUND, "Position {} out of bound in Chunk::addColumn(), max position = {}", position, !columns.empty() ? columns.size() - 1 : 0); diff --git a/src/Processors/Sources/ConstChunkGenerator.h b/src/Processors/Sources/ConstChunkGenerator.h index 8232bd5f0dff..ca120fc23d23 100644 --- a/src/Processors/Sources/ConstChunkGenerator.h +++ b/src/Processors/Sources/ConstChunkGenerator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -13,7 +14,7 @@ class ConstChunkGenerator : public ISource public: ConstChunkGenerator(SharedHeader header, size_t total_num_rows, size_t max_block_size_) : ISource(std::move(header)) - , remaining_rows(total_num_rows), max_block_size(max_block_size_) + , generated_rows(0), remaining_rows(total_num_rows), max_block_size(max_block_size_) { } @@ -27,10 +28,14 @@ class ConstChunkGenerator : public ISource size_t num_rows = std::min(max_block_size, remaining_rows); remaining_rows -= num_rows; - return cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + auto chunk = cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + chunk.getChunkInfos().add(std::make_shared(generated_rows)); + generated_rows += num_rows; + return chunk; } private: + size_t generated_rows; size_t remaining_rows; size_t max_block_size; }; diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 3ff9ef0a9827..7c4bf61113e9 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -662,7 +662,11 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet /// We can actually return it, and the first call to RemoteQueryExecutor::read /// will return earlier. We should consider doing it. if (!packet.block.empty() && (packet.block.rows() > 0)) + { + if (extension && extension->replica_info) + replica_has_processed_data.insert(extension->replica_info->number_of_current_replica); return ReadResult(adaptBlockStructure(packet.block, *header)); + } break; /// If the block is empty - we will receive other packets before EndOfStream. 
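As a rough illustration of the recovery rule that the ConnectionLost handling below relies on (the function here is a hypothetical sketch, not part of the patch): a replica that disappears can only be forgiven when the task iterator supports re-running tasks and that replica has not returned any data rows yet, because its unprocessed tasks can then be rescheduled onto other replicas; if it already produced data, rescheduling would duplicate rows, so the original exception must propagate instead.

#include <cstddef>
#include <unordered_set>

/// Decide whether a lost replica's tasks may be rescheduled instead of failing the query.
bool canRecoverFromLostReplica(
    bool task_iterator_supports_rerun,
    size_t replica_number,
    const std::unordered_set<size_t> & replicas_that_returned_data)
{
    return task_iterator_supports_rerun
        && !replicas_that_returned_data.contains(replica_number);
}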
case Protocol::Server::Exception: @@ -724,6 +728,19 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet case Protocol::Server::TimezoneUpdate: break; + case Protocol::Server::ConnectionLost: + if (extension && extension->task_iterator && extension->task_iterator->supportRerunTask() && extension->replica_info) + { + if (!replica_has_processed_data.contains(extension->replica_info->number_of_current_replica)) + { + finished = true; + extension->task_iterator->rescheduleTasksFromReplica(extension->replica_info->number_of_current_replica); + return ReadResult(Block{}); + } + } + packet.exception->rethrow(); + break; + default: got_unknown_packet_from_replica = true; throw Exception( @@ -1005,6 +1022,11 @@ void RemoteQueryExecutor::setProfileInfoCallback(ProfileInfoCallback callback) profile_info_callback = std::move(callback); } +bool RemoteQueryExecutor::skipUnavailableShards() const +{ + return context->getSettingsRef()[Setting::skip_unavailable_shards]; +} + bool RemoteQueryExecutor::needToSkipUnavailableShard() { if (context->getSettingsRef()[Setting::skip_unavailable_shards] && (0 == connections->size())) diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index e57aad64813b..8c5b1144cf82 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -34,7 +34,22 @@ class RemoteQueryExecutorReadContext; class ParallelReplicasReadingCoordinator; -using TaskIterator = std::function; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +}; + +class TaskIterator +{ +public: + virtual ~TaskIterator() = default; + virtual bool supportRerunTask() const { return false; } + virtual void rescheduleTasksFromReplica(size_t /* number_of_current_replica */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method rescheduleTasksFromReplica is not implemented"); + } + virtual ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const = 0; +}; /// This class allows one to launch queries on remote replicas of one shard and get results class RemoteQueryExecutor @@ -223,6 +238,8 @@ class RemoteQueryExecutor IConnections & getConnections() { return *connections; } + bool skipUnavailableShards() const; + bool needToSkipUnavailableShard(); bool isReplicaUnavailable() const { return extension && extension->parallel_reading_coordinator && connections->size() == 0; } @@ -329,6 +346,8 @@ class RemoteQueryExecutor const bool read_packet_type_separately = false; + std::unordered_set replica_has_processed_data; + /// Send all scalars to remote servers void sendScalars(); diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp index f0598ac89345..55340cfc59aa 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp @@ -17,10 +17,13 @@ namespace ErrorCodes extern const int CANNOT_READ_FROM_SOCKET; extern const int CANNOT_OPEN_FILE; extern const int SOCKET_TIMEOUT; + extern const int ATTEMPT_TO_READ_AFTER_EOF; } RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_) + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_) : AsyncTaskExecutor(std::make_unique(*this)) , executor(executor_) , suspend_when_query_sent(suspend_when_query_sent_) @@ -55,19 +58,42 @@ void 
RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, Sus if (read_context.executor.needToSkipUnavailableShard()) return; - while (true) + try { - read_context.has_read_packet_part = PacketPart::None; + while (true) + { + read_context.has_read_packet_part = PacketPart::None; + + if (read_context.read_packet_type_separately) + { + read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Type; + suspend_callback(); + } + read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Body; + if (read_context.packet.type == Protocol::Server::Data && read_context.packet.block.rows() > 0) + read_context.has_data_packets = true; - if (read_context.read_packet_type_separately) + suspend_callback(); + } + } + catch (const Exception & e) + { + /// If a cluster node unexpectedly shuts down (kill/segfault/power off/etc.), the socket just closes. + /// If the initiator did not process any data packets before, this fact can be ignored. + /// Unprocessed tasks will be executed on other nodes. + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + && !read_context.has_data_packets.load() + && read_context.executor.skipUnavailableShards()) { - read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); - read_context.has_read_packet_part = PacketPart::Type; + read_context.packet.type = Protocol::Server::ConnectionLost; + read_context.packet.exception = std::make_unique(getCurrentExceptionMessageAndPattern(true), getCurrentExceptionCode()); + read_context.has_read_packet_part = PacketPart::Body; suspend_callback(); } - read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); - read_context.has_read_packet_part = PacketPart::Body; - suspend_callback(); + else + throw; } } diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.h b/src/QueryPipeline/RemoteQueryExecutorReadContext.h index abde6cb93ef3..16ff5dc67a69 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.h +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.h @@ -26,7 +26,9 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor { public: explicit RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_); + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_); ~RemoteQueryExecutorReadContext() override; @@ -85,6 +87,7 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor /// None -> Type -> Body -> None /// None -> Body -> None std::atomic has_read_packet_part = PacketPart::None; + std::atomic_bool has_data_packets = false; Packet packet; RemoteQueryExecutor & executor; diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 6eed86d81e7d..c0fb681f9595 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -1,9 +1,19 @@ #include #include +#include +#include +#include +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +}; + namespace { @@ -87,4 +97,185 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( return DB::prepareReadingFromFormat(requested_columns, storage_snapshot,
context, supports_subset_of_columns, supports_tuple_elements); } +DataFileMetaInfo::DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_) +{ + + std::vector column_ids; + for (const auto & column : columns_info_) + column_ids.push_back(column.first); + + auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); + std::unordered_map name_by_index; + for (const auto & name_and_type : name_and_types) + { + const auto name = name_and_type.getNameInStorage(); + auto index = schema_processor.tryGetColumnIDByName(schema_id, name); + if (index.has_value()) + name_by_index[index.value()] = name; + } + + for (const auto & column : columns_info_) + { + auto i_name = name_by_index.find(column.first); + if (i_name != name_by_index.end()) + { + columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, column.second.hyperrectangle}; + } + } +} + +DataFileMetaInfo::DataFileMetaInfo(Poco::JSON::Object::Ptr file_info) +{ + if (!file_info) + return; + + auto log = getLogger("DataFileMetaInfo"); + + if (file_info->has("columns")) + { + auto columns = file_info->getArray("columns"); + for (size_t i = 0; i < columns->size(); ++i) + { + auto column = columns->getObject(static_cast(i)); + + std::string name; + if (column->has("name")) + name = column->get("name").toString(); + else + { + LOG_WARNING(log, "Can't read column name, ignored"); + continue; + } + + DB::DataFileMetaInfo::ColumnInfo column_info; + if (column->has("rows")) + column_info.rows_count = column->get("rows"); + if (column->has("nulls")) + column_info.nulls_count = column->get("nulls"); + if (column->has("range")) + { + Range range(""); + std::string r = column->get("range"); + try + { + range.deserialize(r, /*base64*/ true); + column_info.hyperrectangle = std::move(range); + } + catch (const Exception & e) + { + LOG_WARNING(log, "Can't read range for column {}, range '{}' ignored, error: {}", name, r, e.what()); + } + } + + columns_info[name] = column_info; + } + } +} + +Poco::JSON::Object::Ptr DataFileMetaInfo::toJson() const +{ + Poco::JSON::Object::Ptr file_info = new Poco::JSON::Object(); + + if (!columns_info.empty()) + { + Poco::JSON::Array::Ptr columns = new Poco::JSON::Array(); + + for (const auto & column : columns_info) + { + Poco::JSON::Object::Ptr column_info = new Poco::JSON::Object(); + column_info->set("name", column.first); + if (column.second.rows_count.has_value()) + column_info->set("rows", column.second.rows_count.value()); + if (column.second.nulls_count.has_value()) + column_info->set("nulls", column.second.nulls_count.value()); + if (column.second.hyperrectangle.has_value()) + column_info->set("range", column.second.hyperrectangle.value().serialize(/*base64*/ true)); + + columns->add(column_info); + } + + file_info->set("columns", columns); + } + + return file_info; +} + +constexpr size_t FIELD_MASK_ROWS = 0x1; +constexpr size_t FIELD_MASK_NULLS = 0x2; +constexpr size_t FIELD_MASK_RECT = 0x4; +constexpr size_t FIELD_MASK_ALL = 0x7; + +void DataFileMetaInfo::serialize(WriteBuffer & out) const +{ + auto size = columns_info.size(); + writeIntBinary(size, out); + for (const auto & column : columns_info) + { + writeStringBinary(column.first, out); + size_t field_mask = 0; + if (column.second.rows_count.has_value()) + field_mask |= FIELD_MASK_ROWS; + if (column.second.nulls_count.has_value()) + field_mask |= FIELD_MASK_NULLS; + if (column.second.hyperrectangle.has_value()) + 
field_mask |= FIELD_MASK_RECT; + writeIntBinary(field_mask, out); + + if (column.second.rows_count.has_value()) + writeIntBinary(column.second.rows_count.value(), out); + if (column.second.nulls_count.has_value()) + writeIntBinary(column.second.nulls_count.value(), out); + if (column.second.hyperrectangle.has_value()) + { + writeFieldBinary(column.second.hyperrectangle.value().left, out); + writeFieldBinary(column.second.hyperrectangle.value().right, out); + } + } +} + +DataFileMetaInfo DataFileMetaInfo::deserialize(ReadBuffer & in) +{ + DataFileMetaInfo result; + + size_t size; + readIntBinary(size, in); + + for (size_t i = 0; i < size; ++i) + { + std::string name; + readStringBinary(name, in); + size_t field_mask; + readIntBinary(field_mask, in); + if ((field_mask & FIELD_MASK_ALL) != field_mask) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected field mask: {}", field_mask); + + ColumnInfo & column = result.columns_info[name]; + + if (field_mask & FIELD_MASK_ROWS) + { + Int64 value; + readIntBinary(value, in); + column.rows_count = value; + } + if (field_mask & FIELD_MASK_NULLS) + { + Int64 value; + readIntBinary(value, in); + column.nulls_count = value; + } + if (field_mask & FIELD_MASK_RECT) + { + FieldRef left = readFieldBinary(in); + FieldRef right = readFieldBinary(in); + column.hyperrectangle = Range(left, true, right, true); + } + } + + return result; +} + + } diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index c7421143a7cc..945618a97ad1 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include + namespace DataLake { @@ -30,7 +33,64 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; -} +}; + +namespace Iceberg +{ +struct ColumnInfo; +}; + +class DataFileMetaInfo +{ +public: + DataFileMetaInfo() = default; + + // Deserialize from json in distributed requests + explicit DataFileMetaInfo(const Poco::JSON::Object::Ptr file_info); + + // Serialize to json in distributed requests + Poco::JSON::Object::Ptr toJson() const; + + // subset of Iceberg::ColumnInfo now + struct ColumnInfo + { + std::optional rows_count; + std::optional nulls_count; + std::optional hyperrectangle; + }; + + // Extract metadata from Iceberg structure + explicit DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_); + + void serialize(WriteBuffer & out) const; + static DataFileMetaInfo deserialize(ReadBuffer & in); + + bool empty() const { return columns_info.empty(); } + + std::unordered_map columns_info; +}; + +using DataFileMetaInfoPtr = std::shared_ptr; + +struct DataFileInfo +{ + std::string file_path; + std::optional file_meta_info; + + explicit DataFileInfo(const std::string & file_path_) + : file_path(file_path_) {} + + explicit DataFileInfo(std::string && file_path_) + : file_path(std::move(file_path_)) {} + + bool operator==(const DataFileInfo & rhs) const + { + return file_path == rhs.file_path; + } +}; class SinkToStorage; using SinkToStoragePtr = std::shared_ptr; @@ -38,7 +98,6 @@ class StorageObjectStorageConfiguration; using StorageObjectStorageConfigurationPtr = std::shared_ptr; struct StorageID; struct IObjectIterator; -struct RelativePathWithMetadata; class IObjectStorage; struct ObjectInfo; using 
ObjectInfoPtr = std::shared_ptr; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 6141e20cc6b5..bf22cc15ac6e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -272,6 +272,7 @@ IcebergIterator::IcebergIterator( , blocking_queue(100) , producer_task(std::nullopt) , callback(std::move(callback_)) + , table_schema_id(table_snapshot_->schema_id) { auto delete_file = deletes_iterator.next(); while (delete_file.has_value()) @@ -393,6 +394,10 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->info.data_object_file_path_key); } + object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( + *persistent_components.schema_processor, + table_schema_id, /// current schema id to use current column names + manifest_file_entry->parsed_entry->columns_infos)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); return object_info; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h index 625b19897463..95e372ccfb5c 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h @@ -74,7 +74,7 @@ class IcebergIterator : public IObjectIterator IDataLakeMetadata::FileProgressCallback callback_, Iceberg::TableStateSnapshotPtr table_snapshot_, Iceberg::IcebergDataSnapshotPtr data_snapshot_, - Iceberg::PersistentTableComponents persistent_components); + Iceberg::PersistentTableComponents persistent_components_); ObjectInfoPtr next(size_t) override; @@ -96,6 +96,7 @@ class IcebergIterator : public IObjectIterator std::vector equality_deletes_files; std::exception_ptr exception; std::mutex exception_mutex; + Int32 table_schema_id; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 12379496f211..f8ab0a7e5f3e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1136,12 +1136,14 @@ void IcebergMetadata::addDeleteTransformers( if (!iceberg_object_info->info.position_deletes_objects.empty()) { + LOG_DEBUG(log, "Constructing filter transform for position delete, there are {} delete objects", iceberg_object_info->info.position_deletes_objects.size()); builder.addSimpleTransform( [&](const SharedHeader & header) { return iceberg_object_info->getPositionDeleteTransformer(object_storage, header, format_settings, parser_shared_resources, local_context); }); } const auto & delete_files = iceberg_object_info->info.equality_deletes_objects; - LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); + if (!delete_files.empty()) + LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); for (const EqualityDeleteObject & delete_file : delete_files) { auto simple_transform_adder = [&](const SharedHeader & header) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 1ccf5240c15e..c7453440512e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,6 +1,23 @@ #pragma once #include "config.h" +#include +#include + +#include + +namespace DB::Iceberg +{ + +struct ColumnInfo +{ + std::optional rows_count; + std::optional bytes_size; + std::optional nulls_count; + std::optional hyperrectangle; +}; + +} #if USE_AVRO @@ -43,13 +60,6 @@ enum class ManifestFileContentType String FileContentTypeToString(FileContentType type); -struct ColumnInfo -{ - std::optional rows_count; - std::optional bytes_size; - std::optional nulls_count; -}; - struct PartitionSpecsEntry { Int32 source_id; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 9af7bb7c6fdb..e46b7272cc57 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -26,11 +26,14 @@ namespace Setting extern const SettingsBool use_hive_partitioning; extern const SettingsBool cluster_function_process_archive_on_multiple_nodes; extern const SettingsObjectStorageGranularityLevel cluster_table_function_split_granularity; + extern const SettingsUInt64 lock_object_storage_task_distribution_ms; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int INVALID_SETTING_VALUE; } String StorageObjectStorageCluster::getPathSample(ContextPtr context) @@ -232,6 +235,42 @@ void StorageObjectStorageCluster::updateExternalDynamicMetadataIfExists(ContextP setInMemoryMetadata(new_metadata); } +class TaskDistributor : public TaskIterator +{ +public: + TaskDistributor(std::shared_ptr iterator, + std::vector && ids_of_hosts, + bool send_over_whole_archive, + uint64_t lock_object_storage_task_distribution_ms, + ContextPtr context_, + bool iceberg_read_optimization_enabled) + : task_distributor( + iterator, + std::move(ids_of_hosts), + send_over_whole_archive, + lock_object_storage_task_distribution_ms, + iceberg_read_optimization_enabled) + , context(context_) {} + ~TaskDistributor() override = default; + bool supportRerunTask() const override { return true; } + void rescheduleTasksFromReplica(size_t number_of_current_replica) override + { + task_distributor.rescheduleTasksFromReplica(number_of_current_replica); + } + + ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const override + { + auto task = task_distributor.getNextTask(number_of_current_replica); + if (task) + return std::make_shared(std::move(task), context); + return std::make_shared(); + } + +private: + mutable StorageObjectStorageStableTaskDistributor task_distributor; + ContextPtr context; +}; + RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * filter, @@ -278,19 +317,24 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten } } - auto task_distributor = std::make_shared( - iterator, - std::move(ids_of_hosts), - /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]); + uint64_t lock_object_storage_task_distribution_ms = local_context->getSettingsRef()[Setting::lock_object_storage_task_distribution_ms]; - auto callback = std::make_shared( - [task_distributor, local_context](size_t number_of_current_replica) mutable -> ClusterFunctionReadTaskResponsePtr - { - auto task = 
task_distributor->getNextTask(number_of_current_replica); - if (task) - return std::make_shared(std::move(task), local_context); - return std::make_shared(); - }); + /// Check value to avoid negative result after conversion in microseconds. + /// Poco::Timestamp::TimeDiff is signed int 64. + static const uint64_t lock_object_storage_task_distribution_ms_max = 0x0020000000000000ULL; + if (lock_object_storage_task_distribution_ms > lock_object_storage_task_distribution_ms_max) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, + "Value lock_object_storage_task_distribution_ms is too big: {}, allowed maximum is {}", + lock_object_storage_task_distribution_ms, + lock_object_storage_task_distribution_ms_max + ); + + auto callback = std::make_shared(iterator, + std::move(ids_of_hosts), + /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes], + lock_object_storage_task_distribution_ms, + local_context, + /* iceberg_read_optimization_enabled */local_context->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]); return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 2a13d2d09603..1f3e1e7419ba 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ namespace ProfileEvents extern const Event ObjectStorageGlobFilteredObjects; extern const Event ObjectStoragePredicateFilteredObjects; extern const Event ObjectStorageReadObjects; + extern const Event ObjectStorageClusterProcessedTasks; + extern const Event ObjectStorageClusterWaitingMicroseconds; } namespace CurrentMetrics @@ -82,6 +85,7 @@ namespace Setting extern const SettingsUInt64 s3_path_filter_limit; extern const SettingsBool use_parquet_metadata_cache; extern const SettingsBool input_format_parquet_use_native_reader_v3; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } namespace ErrorCodes @@ -399,6 +403,16 @@ Chunk StorageObjectStorageSource::generate() }, read_context); + /// Not empty when allow_experimental_iceberg_read_optimization=true + /// and some columns were removed from read list as columns with constant values. + /// Restore data for these columns. 
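The cap of 0x0020000000000000 on lock_object_storage_task_distribution_ms is 2**53, the largest power of two whose microsecond equivalent still fits into a signed 64-bit Poco::Timestamp::TimeDiff after the ms-to-us conversion. A quick Python check of that bound:

    # Why the setting is capped at 2**53 milliseconds: the value is multiplied
    # by 1000 to get microseconds and must still fit a signed 64-bit integer.
    MAX_LOCK_MS = 0x0020000000000000        # 2**53
    INT64_MAX = 2**63 - 1

    assert MAX_LOCK_MS * 1000 <= INT64_MAX          # 2**53 ms fits as signed us
    assert (MAX_LOCK_MS * 2) * 1000 > INT64_MAX     # 2**54 ms would overflow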
+ for (const auto & constant_column : reader.constant_columns_with_values) + { + chunk.addColumn(constant_column.first, + constant_column.second.name_and_type.type->createColumnConst( + chunk.getNumRows(), constant_column.second.value)); + } + #if USE_PARQUET if (chunk_size && chunk.hasColumns()) { @@ -545,11 +559,34 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ObjectInfoPtr object_info; auto query_settings = configuration->getQuerySettings(context_); + bool not_a_path = false; + do { + not_a_path = false; object_info = file_iterator->next(processor); - if (!object_info || object_info->getPath().empty()) + if (!object_info) + return {}; + + if (object_info->relative_path_with_metadata.getCommand().isValid()) + { + auto retry_after_us = object_info->relative_path_with_metadata.getCommand().getRetryAfterUs(); + if (retry_after_us.has_value()) + { + not_a_path = true; + /// TODO: Make asyncronous waiting without sleep in thread + /// Now this sleep is on executor node in worker thread + /// Does not block query initiator + auto wait_time = std::min(Poco::Timestamp::TimeDiff(100000ul), retry_after_us.value()); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterWaitingMicroseconds, wait_time); + sleepForMicroseconds(wait_time); + continue; + } + object_info->relative_path_with_metadata.setFileMetaInfo(object_info->relative_path_with_metadata.getCommand().getFileMetaInfo()); + } + + if (object_info->getPath().empty()) return {}; if (!object_info->getObjectMetadata()) @@ -568,16 +605,28 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade else object_info->setObjectMetadata(object_storage->getObjectMetadata(path, with_tags)); } - } while (query_settings.skip_empty_files - && object_info->getObjectMetadata()->size_bytes == 0 - && object_info->getObjectMetadata()->is_size_known); + } + while (not_a_path + || (query_settings.skip_empty_files + && object_info->getObjectMetadata()->size_bytes == 0 + && object_info->getObjectMetadata()->is_size_known)); + + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterProcessedTasks); QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; + std::optional rows_count_from_metadata; auto try_get_num_rows_from_cache = [&]() -> std::optional { + if (rows_count_from_metadata.has_value()) + { + /// Must be non negative here + size_t value = rows_count_from_metadata.value(); + return value; + } + if (!schema_cache) return std::nullopt; @@ -595,6 +644,128 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade return schema_cache->tryGetNumRows(cache_key, get_last_mod_time); }; + /// List of columns with constant value in current file, and values + std::map constant_columns_with_values; + std::unordered_set constant_columns; + + NamesAndTypesList requested_columns_copy = read_from_format_info.requested_columns; + + std::unordered_map> requested_columns_list; + { + size_t column_index = 0; + for (const auto & column : requested_columns_copy) + requested_columns_list[column.getNameInStorage()] = std::make_pair(column_index++, column); + } + + if (context_->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]) + { + auto file_meta_data = object_info->relative_path_with_metadata.getFileMetaInfo(); + if (file_meta_data.has_value()) + { + bool is_all_rows_count_equals = true; + for (const auto & column : file_meta_data.value()->columns_info) + { + if (is_all_rows_count_equals && column.second.rows_count.has_value()) + 
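The retry branch above handles the case where the distributor answers with a retry-after command instead of a path: the worker sleeps at most 100 ms per iteration and asks again, without blocking the query initiator. A minimal Python sketch of that loop, with a hypothetical fetch_task callback:

    # Sketch of the worker-side wait: a "retry after N microseconds" response
    # makes the executor sleep a bounded slice and poll again, so files owned
    # by a busy or lost replica eventually become available to this worker.
    import time

    MAX_SLEEP_US = 100_000  # 100 ms upper bound per iteration

    def next_object(fetch_task):
        while True:
            task = fetch_task()
            if task is None:                  # iterator exhausted
                return None
            retry_after_us = task.get("retry_after_us")
            if retry_after_us is None:        # a real file path
                return task["path"]
            time.sleep(min(MAX_SLEEP_US, retry_after_us) / 1_000_000)

    responses = iter([{"retry_after_us": 20_000}, {"path": "data/part0.csv"}])
    print(next_object(lambda: next(responses, None)))  # sleeps 20 ms, prints the path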
{ + if (rows_count_from_metadata.has_value()) + { + if (column.second.rows_count.value() != rows_count_from_metadata.value()) + { + LOG_WARNING(log, "Inconsistent rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + } + else if (column.second.rows_count.value() < 0) + { + LOG_WARNING(log, "Negative rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + else + rows_count_from_metadata = column.second.rows_count; + } + + if (column.second.hyperrectangle.has_value()) + { + auto column_name = column.first; + + auto i_column = requested_columns_list.find(column_name); + if (i_column == requested_columns_list.end()) + continue; + + if (column.second.hyperrectangle.value().isPoint() && + (!column.second.nulls_count.has_value() || column.second.nulls_count.value() <= 0)) + { + /// isPoint() method checks before that left==right + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + column.second.hyperrectangle.value().left + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value '{}'", + object_info->getPath(), + column_name, + i_column->second.second.type, + column.second.hyperrectangle.value().left.dump()); + } + else if (column.second.rows_count.has_value() && column.second.nulls_count.has_value() + && column.second.rows_count.value() == column.second.nulls_count.value() + && i_column->second.second.type->isNullable()) + { + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + i_column->second.second.type); + } + } + } + for (const auto & column : requested_columns_list) + { + const auto & column_name = column.first; + + if (file_meta_data.value()->columns_info.contains(column_name)) + continue; + + if (!column.second.second.type->isNullable()) + continue; + + /// Column is nullable and absent in file + constant_columns_with_values[column.second.first] = + ConstColumnWithValue{ + column.second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + column.second.second.type); + } + } + + if (!constant_columns.empty()) + { + size_t original_columns = requested_columns_copy.size(); + requested_columns_copy = requested_columns_copy.eraseNames(constant_columns); + if (requested_columns_copy.size() + constant_columns.size() != original_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't remove constant columns for file {} correct, fallback to read. Founded constant columns: [{}]", + object_info->getPath(), constant_columns); + if (requested_columns_copy.empty()) + need_only_count = true; + } + } + std::optional num_rows_from_cache = need_only_count && context_->getSettingsRef()[Setting::use_cache_for_count_from_files] ? 
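A compact Python sketch of the classification performed above, under simplified assumptions about the stats layout: a point range with no NULLs yields a constant value, an all-NULL nullable column yields a constant NULL, a nullable column missing from the file is also treated as a constant NULL, and everything else stays in the read list.

    def split_constant_columns(requested, file_stats):
        """requested: {name: is_nullable}; file_stats: {name: stats dict}."""
        constants, to_read = {}, []
        for name, is_nullable in requested.items():
            st = file_stats.get(name)
            if st is None:
                if is_nullable:
                    constants[name] = None        # absent in file -> NULL
                else:
                    to_read.append(name)
                continue
            rng, nulls, rows = st.get("range"), st.get("nulls", 0), st.get("rows")
            if rng and rng[0] == rng[1] and nulls == 0:
                constants[name] = rng[0]          # point range, no NULLs
            elif is_nullable and rows is not None and nulls == rows:
                constants[name] = None            # all rows are NULL
            else:
                to_read.append(name)
        return constants, to_read

    consts, remaining = split_constant_columns(
        {"tag": False, "name": True, "value": False},
        {"tag": {"range": (5, 5), "nulls": 0, "rows": 4},
         "value": {"range": (1, 9), "nulls": 0, "rows": 4}})
    assert consts == {"tag": 5, "name": None} and remaining == ["value"]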
try_get_num_rows_from_cache() : std::nullopt; @@ -612,6 +783,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade columns.emplace_back(type->createColumn(), type, name); builder.init(Pipe(std::make_shared( std::make_shared(columns), *num_rows_from_cache, max_block_size))); + if (!constant_columns.empty()) + configuration->addDeleteTransformers(object_info, builder, format_settings, parser_shared_resources, context_); } else { @@ -766,7 +939,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade /// from chunk read by IInputFormat. builder.addSimpleTransform([&](const SharedHeader & header) { - return std::make_shared(header, read_from_format_info.requested_columns); + return std::make_shared(header, requested_columns_copy); }); auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -775,7 +948,12 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); return ReaderHolder( - object_info, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)); + object_info, + std::move(read_buf), + std::move(source), + std::move(pipeline), + std::move(current_reader), + std::move(constant_columns_with_values)); } std::future StorageObjectStorageSource::createReaderAsync() @@ -1242,12 +1420,14 @@ StorageObjectStorageSource::ReaderHolder::ReaderHolder( std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_) + std::unique_ptr reader_, + std::map && constant_columns_with_values_) : object_info(std::move(object_info_)) , read_buf(std::move(read_buf_)) , source(std::move(source_)) , pipeline(std::move(pipeline_)) , reader(std::move(reader_)) + , constant_columns_with_values(std::move(constant_columns_with_values_)) { } @@ -1261,6 +1441,7 @@ StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexc source = std::move(other.source); read_buf = std::move(other.read_buf); object_info = std::move(other.object_info); + constant_columns_with_values = std::move(other.constant_columns_with_values); return *this; } @@ -1275,6 +1456,12 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( , is_archive(is_archive_) , object_storage(object_storage_) { + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); + return; + } + ThreadPool pool( CurrentMetrics::StorageObjectStorageThreads, CurrentMetrics::StorageObjectStorageThreadsActive, @@ -1310,6 +1497,12 @@ ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::next(size_t) ObjectInfoPtr object_info; if (current_index >= buffer.size()) { + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); + return nullptr; + } + auto task = callback(); if (!task || task->isEmpty()) return nullptr; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 5cbe931a047e..1f8a712940af 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -89,6 +89,12 @@ class StorageObjectStorageSource : public ISource size_t total_files_read = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); + struct ConstColumnWithValue + { + NameAndTypePair 
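The constant columns dropped from the read list travel with the ReaderHolder as (position, type, value) entries and are re-inserted into every chunk in generate(), the equivalent of createColumnConst over chunk.getNumRows() rows. A row-oriented Python sketch of the re-insertion, not the actual columnar code:

    # Sketch: rows were read without the constant columns; add them back at
    # their original offsets so the block layout matches the requested columns.
    def restore_constants(rows, constants_by_position):
        """rows: tuples read from the file; constants_by_position: {index: value}."""
        restored = []
        for row in rows:
            out, read_iter = [], iter(row)
            width = len(row) + len(constants_by_position)
            for pos in range(width):
                if pos in constants_by_position:
                    out.append(constants_by_position[pos])
                else:
                    out.append(next(read_iter))
            restored.append(tuple(out))
        return restored

    # Requested columns were (tag, name, value); tag (pos 0) was constant 5 in this file.
    assert restore_constants([("a", 1), ("b", 2)], {0: 5}) == [(5, "a", 1), (5, "b", 2)]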
name_and_type; + Field value; + }; + struct ReaderHolder : private boost::noncopyable { public: @@ -97,7 +103,8 @@ class StorageObjectStorageSource : public ISource std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_); + std::unique_ptr reader_, + std::map && constant_columns_with_values_); ReaderHolder() = default; ReaderHolder(ReaderHolder && other) noexcept { *this = std::move(other); } @@ -116,6 +123,9 @@ class StorageObjectStorageSource : public ISource std::shared_ptr source; std::unique_ptr pipeline; std::unique_ptr reader; + + public: + std::map constant_columns_with_values; }; ReaderHolder reader; @@ -255,7 +265,7 @@ class StorageObjectStorageSource::KeysIterator : public IObjectIterator const ObjectStoragePtr object_storage; const NamesAndTypesList virtual_columns; const std::function file_progress_callback; - const std::vector keys; + const Strings keys; std::atomic index = 0; const bool ignore_non_existent_files; const bool skip_object_metadata; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp index 5a54601b171c..c661bf572a08 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp @@ -3,40 +3,71 @@ #include #include +namespace ProfileEvents +{ + extern const Event ObjectStorageClusterSentToMatchedReplica; + extern const Event ObjectStorageClusterSentToNonMatchedReplica; +}; + namespace DB { namespace ErrorCodes { extern const int LOGICAL_ERROR; -} + extern const int CANNOT_READ_ALL_DATA; +}; StorageObjectStorageStableTaskDistributor::StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_) + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_, + bool iceberg_read_optimization_enabled_) : iterator(std::move(iterator_)) , send_over_whole_archive(send_over_whole_archive_) , connection_to_files(ids_of_nodes_.size()) , ids_of_nodes(std::move(ids_of_nodes_)) + , lock_object_storage_task_distribution_us(lock_object_storage_task_distribution_ms_ * 1000) , iterator_exhausted(false) + , iceberg_read_optimization_enabled(iceberg_read_optimization_enabled_) { + Poco::Timestamp now; + size_t nodes = ids_of_nodes.size(); + for (size_t i = 0; i < nodes; ++i) + { + replica_to_files_to_be_processed[i] = std::list{}; + last_node_activity[i] = now; + } } ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getNextTask(size_t number_of_current_replica) { LOG_TRACE(log, "Received request from replica {} looking for a file", number_of_current_replica); - // 1. Check pre-queued files first - if (auto file = getPreQueuedFile(number_of_current_replica)) - return file; + saveLastNodeActivity(number_of_current_replica); - // 2. Try to find a matching file from the iterator - if (auto file = getMatchingFileFromIterator(number_of_current_replica)) - return file; + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost, can't set task for it anymore", + number_of_current_replica + ); + // 1. Check pre-queued files first + auto file = getPreQueuedFile(number_of_current_replica); + // 2. 
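A short Python sketch of the getNextTask flow introduced above, with hypothetical callbacks standing in for the three lookup stages: refuse replicas already marked lost, try pre-queued files, then a matching file from the iterator, then stealing, and remember every handed-out file per replica so it can be re-queued later.

    class Distributor:
        def __init__(self, replicas):
            self.in_flight = {r: [] for r in replicas}   # replica -> files handed out

        def get_next_task(self, replica, prequeued, from_iterator, steal):
            if replica not in self.in_flight:
                raise RuntimeError(f"replica {replica} was marked as lost")
            task = prequeued(replica) or from_iterator(replica) or steal(replica)
            if task is not None:
                self.in_flight[replica].append(task)     # remembered for rescheduling
            return task

    d = Distributor(["r0", "r1"])
    print(d.get_next_task("r0", lambda r: None, lambda r: "part0.csv", lambda r: None))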
Try to find a matching file from the iterator + if (!file) + file = getMatchingFileFromIterator(number_of_current_replica); // 3. Process unprocessed files if iterator is exhausted - return getAnyUnprocessedFile(number_of_current_replica); + if (!file) + file = getAnyUnprocessedFile(number_of_current_replica); + + if (file) + processed_file_list_ptr->second.push_back(file); + + return file; } size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String & file_path) @@ -48,16 +79,27 @@ size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String return 0; /// Rendezvous hashing - size_t best_id = 0; - UInt64 best_weight = sipHash64(ids_of_nodes[0] + file_path); - for (size_t id = 1; id < nodes_count; ++id) + auto replica = replica_to_files_to_be_processed.begin(); + if (replica == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "No active replicas, can't find best replica for file {}", + file_path + ); + + size_t best_id = replica->first; + UInt64 best_weight = sipHash64(ids_of_nodes[best_id] + file_path); + ++replica; + while (replica != replica_to_files_to_be_processed.end()) { + size_t id = replica->first; UInt64 weight = sipHash64(ids_of_nodes[id] + file_path); if (weight > best_weight) { best_weight = weight; best_id = id; } + ++replica; } return best_id; } @@ -95,6 +137,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getPreQueuedFile(size_t number_of_current_replica ); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica); return next_file; } @@ -138,6 +181,17 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter file_identifier = object_info->getIdentifier(); } + if (iceberg_read_optimization_enabled) + { + auto file_meta_info = object_info->relative_path_with_metadata.getFileMetaInfo(); + if (file_meta_info.has_value()) + { + auto file_path = send_over_whole_archive ? 
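The rendezvous (highest-random-weight) selection above now runs only over the replicas still present in replica_to_files_to_be_processed. A minimal Python sketch of the same choice; sha256 stands in for sipHash64 here:

    # Each file deterministically picks the active replica with the highest
    # weight, so adding or removing one replica only moves that replica's share.
    import hashlib

    def weight(node_id, path):
        return int.from_bytes(hashlib.sha256((node_id + path).encode()).digest()[:8], "big")

    def replica_for_file(path, active_replicas):
        if not active_replicas:
            raise RuntimeError("no active replicas")
        return max(active_replicas, key=lambda node: weight(node, path))

    nodes = ["replica0", "replica1", "replica2", "replica3"]
    print(replica_for_file("data/part0.csv", nodes))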
object_info->getPathOrPathToArchiveIfArchive() : object_info->getPath(); + object_info->relative_path_with_metadata.command.setFilePath(file_path); + object_info->relative_path_with_metadata.command.setFileMetaInfo(file_meta_info.value()); + } + } + size_t file_replica_idx = getReplicaForFile(file_identifier); if (file_replica_idx == number_of_current_replica) { @@ -146,6 +200,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter file_identifier, number_of_current_replica ); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica); return object_info; } LOG_TEST( @@ -159,7 +214,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter // Queue file for its assigned replica { std::lock_guard lock(mutex); - unprocessed_files.emplace(file_identifier, object_info); + unprocessed_files.emplace(file_identifier, std::make_pair(object_info, file_replica_idx)); connection_to_files[file_replica_idx].push_back(object_info); } } @@ -169,26 +224,97 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getAnyUnprocessedFile(size_t number_of_current_replica) { + /// Limit time of node activity to keep task in queue + Poco::Timestamp activity_limit; + Poco::Timestamp oldest_activity; + if (lock_object_storage_task_distribution_us > 0) + activity_limit -= lock_object_storage_task_distribution_us; + std::lock_guard lock(mutex); if (!unprocessed_files.empty()) { auto it = unprocessed_files.begin(); - auto next_file = it->second; - unprocessed_files.erase(it); - auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + while (it != unprocessed_files.end()) + { + auto number_of_matched_replica = it->second.second; + auto last_activity = last_node_activity.find(number_of_matched_replica); + if (lock_object_storage_task_distribution_us <= 0 // file deferring is turned off + || it->second.second == number_of_current_replica // file is matching with current replica + || last_activity == last_node_activity.end() // msut never be happen, last_activity is filled for each replica on start + || activity_limit > last_activity->second) // matched replica did not ask for a new files for a while + { + auto next_file = it->second.first; + unprocessed_files.erase(it); + + auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + LOG_TRACE( + log, + "Iterator exhausted. Assigning unprocessed file {} to replica {} from matched replica {}", + file_path, + number_of_current_replica, + number_of_matched_replica + ); + + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToNonMatchedReplica); + return next_file; + } + + oldest_activity = std::min(oldest_activity, last_activity->second); + ++it; + } + LOG_TRACE( log, - "Iterator exhausted. 
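The deferral rule above only lets a caller steal a file assigned to another replica if that replica has not requested work within the lock window; otherwise the caller gets a retry-after hint. A simplified Python sketch, assuming microsecond timestamps:

    import time

    def take_unprocessed(caller, queue, last_activity, lock_us):
        """queue: list of (file, owner); last_activity: {replica: monotonic us}."""
        now = time.monotonic_ns() // 1000
        activity_limit = now - lock_us
        oldest = now
        for i, (file, owner) in enumerate(queue):
            owner_seen = last_activity.get(owner, 0)
            # Steal if deferring is off, the file matches the caller,
            # or the owner has been silent for longer than the lock window.
            if lock_us <= 0 or owner == caller or owner_seen < activity_limit:
                del queue[i]
                return file, None
            oldest = min(oldest, owner_seen)
        if queue:
            return None, oldest - activity_limit   # retry-after, in microseconds
        return None, None

    q = [("part1.csv", "r1")]
    now_us = time.monotonic_ns() // 1000
    file, retry = take_unprocessed("r0", q, {"r1": now_us}, lock_us=30_000_000)
    assert file is None and retry is not None      # r1 is still active, so r0 must wait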
Assigning unprocessed file {} to replica {}", - file_path, - number_of_current_replica + "No unprocessed file for replica {}, need to retry after {} us", + number_of_current_replica, + oldest_activity - activity_limit ); - return next_file; + /// All unprocessed files owned by alive replicas with recenlty activity + /// Need to retry after (oldest_activity - activity_limit) microseconds + RelativePathWithMetadata::CommandInTaskResponse response; + response.setRetryAfterUs(oldest_activity - activity_limit); + return std::make_shared(response.toString()); } return {}; } +void StorageObjectStorageStableTaskDistributor::saveLastNodeActivity(size_t number_of_current_replica) +{ + Poco::Timestamp now; + std::lock_guard lock(mutex); + last_node_activity[number_of_current_replica] = now; +} + +void StorageObjectStorageStableTaskDistributor::rescheduleTasksFromReplica(size_t number_of_current_replica) +{ + LOG_INFO(log, "Replica {} is marked as lost, tasks are returned to queue", number_of_current_replica); + std::lock_guard lock(mutex); + + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost already", + number_of_current_replica + ); + + if (replica_to_files_to_be_processed.size() < 2) + throw Exception( + ErrorCodes::CANNOT_READ_ALL_DATA, + "All replicas were marked as lost" + ); + + for (const auto & file : processed_file_list_ptr->second) + { + auto file_replica_idx = getReplicaForFile(file->getPath()); + unprocessed_files.emplace(file->getPath(), std::make_pair(file, file_replica_idx)); + connection_to_files[file_replica_idx].push_back(file); + } + replica_to_files_to_be_processed.erase(number_of_current_replica); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h index 02d3ba7a030f..0cd10ac7188e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h @@ -4,7 +4,13 @@ #include #include #include +#include + +#include + #include +#include +#include #include #include #include @@ -18,26 +24,38 @@ class StorageObjectStorageStableTaskDistributor StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_); + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_, + bool iceberg_read_optimization_enabled_); ObjectInfoPtr getNextTask(size_t number_of_current_replica); + /// Insert objects back to unprocessed files + void rescheduleTasksFromReplica(size_t number_of_current_replica); + private: size_t getReplicaForFile(const String & file_path); ObjectInfoPtr getPreQueuedFile(size_t number_of_current_replica); ObjectInfoPtr getMatchingFileFromIterator(size_t number_of_current_replica); ObjectInfoPtr getAnyUnprocessedFile(size_t number_of_current_replica); + void saveLastNodeActivity(size_t number_of_current_replica); + const std::shared_ptr iterator; const bool send_over_whole_archive; std::vector> connection_to_files; - std::unordered_map unprocessed_files; + std::unordered_map> unprocessed_files; std::vector ids_of_nodes; + std::unordered_map last_node_activity; + Poco::Timestamp::TimeDiff lock_object_storage_task_distribution_us; + std::unordered_map> replica_to_files_to_be_processed; 
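rescheduleTasksFromReplica puts the lost replica's in-flight files back into the unprocessed queue, re-hashed across the replicas that remain, and losing the last replica is an error. A small Python sketch of that behaviour (sha256 again stands in for sipHash64):

    import hashlib

    def _owner(path, replicas):
        h = lambda n: int.from_bytes(hashlib.sha256((n + path).encode()).digest()[:8], "big")
        return max(replicas, key=h)

    def reschedule(lost, in_flight, unprocessed):
        """in_flight: {replica: [files]}; unprocessed: list of (file, owner)."""
        if lost not in in_flight:
            raise RuntimeError(f"replica {lost} already marked as lost")
        if len(in_flight) < 2:
            raise RuntimeError("all replicas were marked as lost")
        files = in_flight.pop(lost)
        remaining = list(in_flight)
        unprocessed.extend((f, _owner(f, remaining)) for f in files)

    in_flight = {"r0": ["a.csv"], "r1": []}
    queue = []
    reschedule("r0", in_flight, queue)
    print(queue)   # a.csv re-queued and now owned by r1, the only replica left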
+ std::mutex mutex; bool iterator_exhausted = false; + bool iceberg_read_optimization_enabled = false; LoggerPtr log = getLogger("StorageClusterTaskDistributor"); }; diff --git a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp index b00c1d609fa1..c9fc2831b214 100644 --- a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp +++ b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp @@ -101,7 +101,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {6})); @@ -110,7 +110,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 4})); @@ -119,7 +119,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 2, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 7, 8})); @@ -128,7 +128,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 3, 10)); ASSERT_TRUE(checkHead(paths, {3, 9})); @@ -139,7 +139,7 @@ TEST(RendezvousHashing, MultipleNodes) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); { std::vector paths; @@ -171,7 +171,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 6, 7, 8, 9})); @@ -180,7 +180,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + 
StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 3, 4})); @@ -191,7 +191,7 @@ TEST(RendezvousHashing, MultipleNodesReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); { std::vector paths; @@ -210,7 +210,7 @@ TEST(RendezvousHashing, MultipleNodesReducedClusterOneByOne) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths0; std::vector paths1; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 085cdb0572c3..d94d87412e0e 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -95,18 +95,49 @@ void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const Sto ); } -RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( - const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +class FileTaskIterator : public TaskIterator { - auto iterator = std::make_shared(paths, std::nullopt, predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr +public: + FileTaskIterator(const Strings & files, + std::optional archive_info, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context, + bool distributed_processing = false) + : iterator(files + , archive_info + , predicate + , virtual_columns + , hive_partition_columns_to_read_from_file_path + , context + , distributed_processing) {} + + ~FileTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override { - auto file = iter->next(); + auto file = iterator.next(); if (file.empty()) return std::make_shared(); return std::make_shared(std::move(file)); - }; - auto callback = std::make_shared(std::move(next_callback)); + } + +private: + mutable StorageFileSource::FilesIterator iterator; +}; + +RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +{ + auto callback = std::make_shared( + paths, + std::nullopt, + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 18ed46e9e005..867c6831deac 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -130,20 +130,42 @@ void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageS ); } -RemoteQueryExecutor::Extension 
StorageURLCluster::getTaskIteratorExtension( - const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +class UrlTaskIterator : public TaskIterator { - auto iterator = std::make_shared( - uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr +public: + UrlTaskIterator(const String & uri, + size_t max_addresses, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context) + : iterator(uri, max_addresses, predicate, virtual_columns, hive_partition_columns_to_read_from_file_path, context) {} + + ~UrlTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override { - auto url = iter->next(); + auto url = iterator.next(); if (url.empty()) return std::make_shared(); return std::make_shared(std::move(url)); - }; - auto callback = std::make_shared(std::move(next_callback)); + } + +private: + mutable StorageURLSource::DisclosedGlobIterator iterator; +}; + +RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +{ + auto callback = std::make_shared( + uri, + context->getSettingsRef()[Setting::glob_expansion_max_elements], + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/tests/integration/test_s3_cache_locality/__init__.py b/tests/integration/test_s3_cache_locality/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_s3_cache_locality/configs/cluster.xml b/tests/integration/test_s3_cache_locality/configs/cluster.xml new file mode 100644 index 000000000000..db54c35374b9 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/cluster.xml @@ -0,0 +1,126 @@ + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + + + + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + + + + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + + + + + + + /var/lib/clickhouse/raw_s3_cache + 10Gi + + + diff --git a/tests/integration/test_s3_cache_locality/configs/named_collections.xml b/tests/integration/test_s3_cache_locality/configs/named_collections.xml new file mode 100644 index 000000000000..6994aa3f5e77 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/named_collections.xml @@ -0,0 +1,10 @@ + + + + http://minio1:9001/root/data/* + minio + ClickHouse_Minio_P@ssw0rd + CSV> + + + diff --git a/tests/integration/test_s3_cache_locality/configs/users.xml b/tests/integration/test_s3_cache_locality/configs/users.xml new file mode 100644 index 
000000000000..4b6ba057ecb1 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/users.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_s3_cache_locality/test.py b/tests/integration/test_s3_cache_locality/test.py new file mode 100644 index 000000000000..68993d85aeed --- /dev/null +++ b/tests/integration/test_s3_cache_locality/test.py @@ -0,0 +1,262 @@ +import csv +import logging +import os +import shutil +import uuid + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.config_cluster import minio_secret_key + + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + + +def create_buckets_s3(cluster, files=1000): + minio = cluster.minio_client + + s3_data = [] + + for file_number in range(files): + file_name = f"data/generated_{files}/file_{file_number}.csv" + os.makedirs(os.path.join(SCRIPT_DIR, f"data/generated_{files}/"), exist_ok=True) + s3_data.append(file_name) + with open(os.path.join(SCRIPT_DIR, file_name), "w+", encoding="utf-8") as f: + # a String, b UInt64 + data = [] + + # Make all files a bit different + data.append( + ["str_" + str(file_number), file_number] + ) + + writer = csv.writer(f) + writer.writerows(data) + + for file in s3_data: + minio.fput_object( + bucket_name=cluster.minio_bucket, + object_name=file, + file_path=os.path.join(SCRIPT_DIR, file), + ) + + for obj in minio.list_objects(cluster.minio_bucket, recursive=True): + print(obj.object_name) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster = ClickHouseCluster(__file__) + # clickhouse0 not a member of cluster_XXX + for i in range(6): + cluster.add_instance( + f"clickhouse{i}", + main_configs=["configs/cluster.xml", "configs/named_collections.xml"], + user_configs=["configs/users.xml"], + macros={"replica": f"clickhouse{i}"}, + with_minio=True, + with_zookeeper=True, + stay_alive=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + create_buckets_s3(cluster) + create_buckets_s3(cluster, files=3) + + yield cluster + finally: + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_1000/"), ignore_errors=True) + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_3/"), ignore_errors=True) + cluster.shutdown() + + +def check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms, files=1000): + for host in list(cluster.instances.values()): + host.query("SYSTEM DROP FILESYSTEM CACHE 'raw_s3_cache'", ignore_error=True) + + settings = { + "enable_filesystem_cache": enable_filesystem_cache, + "filesystem_cache_name": "'raw_s3_cache'", + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + + query_id_first = str(uuid.uuid4()) + result_first = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_first}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_first, + ) + assert result_first == expected_result + query_id_second = str(uuid.uuid4()) + result_second = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_second}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 
+ SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_second, + ) + assert result_second == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_first}") + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_second}") + + s3_get_first = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_first}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_first}' + """, + ) + s3_get_second = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_second}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_second}' + """, + ) + + return int(s3_get_first), int(s3_get_second) + + +def check_s3_gets_by_hosts(cluster, node, expected_result, + lock_object_storage_task_distribution_ms, files=1000): + settings = { + "enable_filesystem_cache": False, + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + query_id = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id, + ) + assert result == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster}") + + s3_get = node.query( + f""" + SELECT ProfileEvents['S3GetObject'] + FROM clusterAllReplicas('{cluster}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id}' + ORDER BY hostname + """, + ) + + return [int(events) for events in s3_get.strip().split("\n")] + + +def check_s3_gets_repeat(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms): + # Repeat test several times to get average result + iterations = 1 if lock_object_storage_task_distribution_ms > 0 else 10 + s3_get_first_sum = 0 + s3_get_second_sum = 0 + for _ in range(iterations): + (s3_get_first, s3_get_second) = check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, lock_object_storage_task_distribution_ms) + s3_get_first_sum += s3_get_first + s3_get_second_sum += s3_get_second + return s3_get_first_sum, s3_get_second_sum + + +@pytest.mark.parametrize("lock_object_storage_task_distribution_ms ", [0, 30000]) +def test_cache_locality(started_cluster, lock_object_storage_task_distribution_ms): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_1000/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Algorithm does not give 100% guarantee, so add 10% on dispersion + dispersion = 0.0 if lock_object_storage_task_distribution_ms > 0 else 0.1 + + # No cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 0, lock_object_storage_task_distribution_ms) + assert s3_get_second == s3_get_first + + # With cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # Different replicas order + (s3_get_first, s3_get_second) = 
check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_34512', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # No last replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_1234', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.179 + dispersion) # actual value - 179 of 1000 files changed replica + + # No first replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_2345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) # actual value - 189 of 1000 files changed replica + + # No first replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # Add new replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_4523', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # New replica and old replica, different replicas order + # All files from removed replica changed replica + # Some files from existed replicas changed replica on the new replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_1234', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.368 + dispersion) # actual value - 368 of 1000 changed replica + + if (lock_object_storage_task_distribution_ms > 0): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [189,210,220,202,179] + s3_get = check_s3_gets_by_hosts('cluster_1234', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [247,243,264,246] + s3_get = check_s3_gets_by_hosts('cluster_2345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [251,280,248,221] + + +def test_cache_locality_few_files(started_cluster): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_3/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Rendezvous hash makes the next distribution: + # file_0 - clickhouse1 + # file_1 - clickhouse4 + # file_2 - clickhouse3 + # The same distribution must be in each query + for _ in range(10): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms=30000, files=3) + assert s3_get == [1,0,1,1,0] diff --git a/tests/integration/test_s3_cluster/data/graceful/part0.csv b/tests/integration/test_s3_cluster/data/graceful/part0.csv new file mode 100644 index 000000000000..2a8ceabbea58 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part0.csv @@ -0,0 +1 @@ +0,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part1.csv b/tests/integration/test_s3_cluster/data/graceful/part1.csv new file mode 100644 index 000000000000..1950012fffd2 
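The 0.179/0.189 tolerances used above follow from rendezvous hashing: dropping one of five replicas only reassigns the files whose best replica was the removed one, roughly 1/5 of them on average. A small Python simulation of that fraction (sha256 instead of sipHash64, so the exact counts differ from the test's):

    import hashlib

    def owner(path, replicas):
        h = lambda n: int.from_bytes(hashlib.sha256((n + path).encode()).digest()[:8], "big")
        return max(replicas, key=h)

    files = [f"file_{i}.csv" for i in range(1000)]
    five = [f"clickhouse{i}" for i in range(1, 6)]
    four = five[:-1]                                  # last replica removed
    moved = sum(owner(f, five) != owner(f, four) for f in files)
    print(moved)   # close to 200 of 1000; the patched test observed 179-189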
--- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part1.csv @@ -0,0 +1 @@ +1,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part2.csv b/tests/integration/test_s3_cluster/data/graceful/part2.csv new file mode 100644 index 000000000000..dc782d5adf9b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part2.csv @@ -0,0 +1 @@ +2,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part3.csv b/tests/integration/test_s3_cluster/data/graceful/part3.csv new file mode 100644 index 000000000000..6e581549d23c --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part3.csv @@ -0,0 +1 @@ +3,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part4.csv b/tests/integration/test_s3_cluster/data/graceful/part4.csv new file mode 100644 index 000000000000..bb5a4d956c51 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part4.csv @@ -0,0 +1 @@ +4,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part5.csv b/tests/integration/test_s3_cluster/data/graceful/part5.csv new file mode 100644 index 000000000000..5cb2c6be144b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part5.csv @@ -0,0 +1 @@ +5,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part6.csv b/tests/integration/test_s3_cluster/data/graceful/part6.csv new file mode 100644 index 000000000000..e2e2428d100d --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part6.csv @@ -0,0 +1 @@ +6,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part7.csv b/tests/integration/test_s3_cluster/data/graceful/part7.csv new file mode 100644 index 000000000000..3c819a315c20 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part7.csv @@ -0,0 +1 @@ +7,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part8.csv b/tests/integration/test_s3_cluster/data/graceful/part8.csv new file mode 100644 index 000000000000..72f39e512be3 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part8.csv @@ -0,0 +1 @@ +8,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part9.csv b/tests/integration/test_s3_cluster/data/graceful/part9.csv new file mode 100644 index 000000000000..f288cb2051dd --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part9.csv @@ -0,0 +1 @@ +9,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partA.csv b/tests/integration/test_s3_cluster/data/graceful/partA.csv new file mode 100644 index 000000000000..da99f68ba784 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partA.csv @@ -0,0 +1 @@ +10,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partB.csv b/tests/integration/test_s3_cluster/data/graceful/partB.csv new file mode 100644 index 000000000000..46591e0be815 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partB.csv @@ -0,0 +1 @@ +11,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partC.csv b/tests/integration/test_s3_cluster/data/graceful/partC.csv new file mode 100644 index 000000000000..24af8010b5c6 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partC.csv @@ -0,0 +1 @@ 
+12,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partD.csv b/tests/integration/test_s3_cluster/data/graceful/partD.csv new file mode 100644 index 000000000000..0365a5024871 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partD.csv @@ -0,0 +1 @@ +13,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partE.csv b/tests/integration/test_s3_cluster/data/graceful/partE.csv new file mode 100644 index 000000000000..3143c0eed915 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partE.csv @@ -0,0 +1 @@ +14,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partF.csv b/tests/integration/test_s3_cluster/data/graceful/partF.csv new file mode 100644 index 000000000000..d0306b9bb806 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partF.csv @@ -0,0 +1 @@ +15,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 76b8f0df2881..9c817209b8ba 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -2,11 +2,13 @@ import logging import os import shutil +import threading import time from email.errors import HeaderParseError import pytest +from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster from helpers.config_cluster import minio_secret_key from helpers.mock_servers import start_mock_servers @@ -21,6 +23,22 @@ "data/clickhouse/part123.csv", "data/database/part2.csv", "data/database/partition675.csv", + "data/graceful/part0.csv", + "data/graceful/part1.csv", + "data/graceful/part2.csv", + "data/graceful/part3.csv", + "data/graceful/part4.csv", + "data/graceful/part5.csv", + "data/graceful/part6.csv", + "data/graceful/part7.csv", + "data/graceful/part8.csv", + "data/graceful/part9.csv", + "data/graceful/partA.csv", + "data/graceful/partB.csv", + "data/graceful/partC.csv", + "data/graceful/partD.csv", + "data/graceful/partE.csv", + "data/graceful/partF.csv", ] @@ -76,6 +94,7 @@ def started_cluster(): macros={"replica": "node1", "shard": "shard1"}, with_minio=True, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_0_1", @@ -83,6 +102,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica2", "shard": "shard1"}, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_1_0", @@ -90,6 +110,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica1", "shard": "shard2"}, with_zookeeper=True, + stay_alive=True, ) logging.info("Starting cluster...") @@ -509,3 +530,63 @@ def test_cluster_default_expression(started_cluster): ) assert result == expected_result + + +def test_graceful_shutdown(started_cluster): + node = started_cluster.instances["s0_0_0"] + node_to_shutdown = started_cluster.instances["s0_1_0"] + + expected = TSV("64\tBar\t8\n56\tFoo\t8\n") + + num_lock = threading.Lock() + errors = 0 + + def query_cycle(): + nonlocal errors + try: + i = 0 + while i < 10: + i += 1 + # Query time 3-4 seconds + # Processing single object 1-2 seconds + result = node.query(f""" + SELECT sum(value),name,sum(sleep(1)+1) as sleep FROM s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/graceful/*', 'minio', '{minio_secret_key}', 'CSV', + 'value UInt32, name String') + GROUP BY name + ORDER BY name + SETTINGS max_threads=2 + """) + with num_lock: + if 
TSV(result) != expected: + errors += 1 + if errors >= 1: + break + except QueryRuntimeException: + with num_lock: + errors += 1 + + threads = [] + + for _ in range(10): + thread = threading.Thread(target=query_cycle) + thread.start() + threads.append(thread) + time.sleep(0.2) + + time.sleep(3) + + node_to_shutdown.query("SYSTEM STOP SWARM MODE") + + # enough time to complete processing of objects, started before "SYSTEM STOP SWARM MODE" + time.sleep(3) + + node_to_shutdown.stop_clickhouse(kill=True) + + for thread in threads: + thread.join() + + node_to_shutdown.start_clickhouse() + + assert errors == 0 diff --git a/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py new file mode 100644 index 000000000000..bdca97add2e5 --- /dev/null +++ b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py @@ -0,0 +1,215 @@ +import pytest + +from helpers.iceberg_utils import ( + get_uuid_str, + get_creation_expression, + execute_spark_query_general, +) + + +@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +@pytest.mark.parametrize("run_on_cluster", [False, True]) +def test_read_constant_columns_optimization(started_cluster_iceberg_with_spark, storage_type, run_on_cluster): + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_read_constant_columns_optimization_" + storage_type + "_" + get_uuid_str() + + def execute_spark_query(query: str): + return execute_spark_query_general( + spark, + started_cluster_iceberg_with_spark, + storage_type, + TABLE_NAME, + query, + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + tag INT, + date DATE, + date2 DATE, + name VARCHAR(50), + number BIGINT + ) + USING iceberg + PARTITIONED BY (identity(tag), years(date)) + OPTIONS('format-version'='2') + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasya', 5), + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasilisa', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (2, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 5), + (2, DATE '2025-01-21', DATE '2025-01-20', 'vasilisa', 5) + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} ALTER COLUMN number FIRST; + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 3, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa'), + (5, 3, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa') + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} RENAME COLUMN name TO name_old; + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} + ADD COLUMNS ( + name string + ); + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 4, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 4, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa', 'icebreaker'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg') + """ + ) + + # Totally must be 7 files + # Partitioned column 'tag' is constant in each file + # Column 'date' is constant in 6 files, has different 
values in (2-2025) + # Column 'date2' is constant in 4 files (1-2024, 2-2025, 5-2025, 6-2025) + # Column 'name_old' is constant in 3 files (1-2025, 2-2025 as 'name', 6-2025 as 'name_old') + # Column 'number' is globally constant + # New column 'name2' is present only in 3 files (4-2025, 5-2025, 6-2025), constant in two (4-2025, 6-2025) + # Files 1-2025 and 6-2025 have only constant columns + + creation_expression = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True, run_on_cluster=run_on_cluster + ) + + # Warm up metadata cache + for replica in started_cluster_iceberg_with_spark.instances.values(): + replica.query(f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0") + + all_data_expected_query_id = get_uuid_str() + all_data_expected = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=all_data_expected_query_id, + ) + const_only_expected_query_id = get_uuid_str() + const_only_expected = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_only_expected_query_id, + ) + const_partial_expected_query_id = get_uuid_str() + const_partial_expected = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial_expected_query_id, + ) + const_partial2_expected_query_id = get_uuid_str() + const_partial2_expected = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial2_expected_query_id, + ) + count_expected_query_id = get_uuid_str() + count_expected = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=count_expected_query_id, + ) + + all_data_query_id = get_uuid_str() + all_data_optimized = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=all_data_query_id, + ) + const_only_query_id = get_uuid_str() + const_only_optimized = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_only_query_id, + ) + const_partial_query_id = get_uuid_str() + const_partial_optimized = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial_query_id, + ) + const_partial2_query_id = get_uuid_str() + const_partial2_optimized = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial2_query_id, + ) + count_query_id = get_uuid_str() + count_optimized = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=count_query_id, + ) + + assert all_data_expected == all_data_optimized + assert const_only_expected == const_only_optimized + assert const_partial_expected == const_partial_optimized + assert const_partial2_expected == const_partial2_optimized + assert 
count_expected == count_optimized + + for replica in started_cluster_iceberg_with_spark.instances.values(): + replica.query("SYSTEM FLUSH LOGS") + + def check_events(query_id, event, is_cluster, expected): + res = instance.query( + f""" + SELECT + sum(tupleElement(arrayJoin(ProfileEvents),2)) as value + FROM + clusterAllReplicas('cluster_simple', system.query_log) + WHERE + type='QueryFinish' + AND tupleElement(arrayJoin(ProfileEvents),1)='{event}' + AND initial_query_id='{query_id}' + GROUP BY ALL + FORMAT CSV + """) + # Weird, but it looks like ReadFileMetadata does not use the local file cache in 26.1 + # metadata.json is always downloaded in 26.1, once per query or subquery + # In 25.8 the count was equal to expected, in 26.1 it is expected * 3 + 1 for the Local case + # and expected * 3 + 4 for the Cluster case, because each subquery loads metadata.json + assert int(res) == expected * 3 + (4 if is_cluster else 1) + + event = "S3GetObject" if storage_type == "s3" else "AzureGetObject" + + # Without the optimization ClickHouse reads all 7 files + check_events(all_data_expected_query_id, event, run_on_cluster, 7) + check_events(const_only_expected_query_id, event, run_on_cluster, 7) + check_events(const_partial_expected_query_id, event, run_on_cluster, 7) + check_events(const_partial2_expected_query_id, event, run_on_cluster, 7) + check_events(count_expected_query_id, event, run_on_cluster, 7) + + # If a file has only constant columns, it is not read + check_events(all_data_query_id, event, run_on_cluster, 5) # 1-2025, 6-2025 must not be read + check_events(const_only_query_id, event, run_on_cluster, 0) # All must not be read + check_events(const_partial_query_id, event, run_on_cluster, 4) # 1-2025, 6-2025 and 2-2025 must not be read + check_events(const_partial2_query_id, event, run_on_cluster, 3) # 6-2025 must not be read, 1-2024, 1-2025, 2-2025 don't have new column 'name' + check_events(count_query_id, event, run_on_cluster, 0) # All must not be read + + def compare_selects(query): + result_expected = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=0") + result_optimized = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=1") + assert result_expected == result_optimized + + compare_selects(f"SELECT _path,* FROM {creation_expression} ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE name_old='vasily' ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE ((tag + length(name_old)) % 2 = 1) ORDER BY ALL") \ No newline at end of file diff --git a/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml new file mode 100644 index 000000000000..a8239a28293c --- /dev/null +++ b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml @@ -0,0 +1,7 @@ +<clickhouse> + <profiles> + <default> + <lock_object_storage_task_distribution_ms>0</lock_object_storage_task_distribution_ms> + </default> + </profiles> +</clickhouse> diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 021e73d39235..2ab756654d7b 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -79,6 +79,7 @@ def started_cluster(): "configs/users.xml", "configs/s3_retry.xml", "configs/sync_insert.xml", + "configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( @@ -136,6 +137,7 @@ def started_cluster(): "configs/s3_retry.xml", "configs/process_archives_as_whole_with_cluster.xml", "configs/sync_insert.xml", +
"configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 253029f72e25..ec8c1b6aa923 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -162,6 +162,7 @@ SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP MERGES','START M SYSTEM TTL MERGES ['SYSTEM STOP TTL MERGES','SYSTEM START TTL MERGES','STOP TTL MERGES','START TTL MERGES'] TABLE SYSTEM SYSTEM FETCHES ['SYSTEM STOP FETCHES','SYSTEM START FETCHES','STOP FETCHES','START FETCHES'] TABLE SYSTEM SYSTEM MOVES ['SYSTEM STOP MOVES','SYSTEM START MOVES','STOP MOVES','START MOVES'] TABLE SYSTEM +SYSTEM SWARM ['SYSTEM STOP SWARM MODE','SYSTEM START SWARM MODE','STOP SWARM MODE','START SWARM MODE'] GLOBAL SYSTEM SYSTEM PULLING REPLICATION LOG ['SYSTEM STOP PULLING REPLICATION LOG','SYSTEM START PULLING REPLICATION LOG'] TABLE SYSTEM SYSTEM CLEANUP ['SYSTEM STOP CLEANUP','SYSTEM START CLEANUP'] TABLE SYSTEM SYSTEM VIEWS ['SYSTEM REFRESH VIEW','SYSTEM START VIEWS','SYSTEM STOP VIEWS','SYSTEM START VIEW','SYSTEM STOP VIEW','SYSTEM CANCEL VIEW','REFRESH VIEW','START VIEWS','STOP VIEWS','START VIEW','STOP VIEW','CANCEL VIEW'] VIEW SYSTEM diff --git a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql index 718eb63ad923..26048ffe5694 100644 --- a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql +++ b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql @@ -4,5 +4,9 @@ -- However, some settings in the experimental tier are meant to control another experimental feature, and then they can be enabled as long as the feature itself is disabled. -- These are in the exceptions list inside NOT IN. 
-SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('throw_on_unsupported_query_inside_transaction'); +SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ( + 'throw_on_unsupported_query_inside_transaction', +-- turned ON for Altinity Antalya builds specifically + 'allow_experimental_iceberg_read_optimization' +); SELECT name, value FROM system.merge_tree_settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('remove_rolled_back_parts_immediately'); From 587c292a325f12d35e7be70157f38bb1e11ae717 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Fri, 24 Apr 2026 01:59:49 +0200 Subject: [PATCH 2/6] Update SettingsChangesHistory.cpp --- src/Core/SettingsChangesHistory.cpp | 30 +++-------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 776393441e03..7961f095bc5b 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -308,10 +308,10 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586 // {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, // {"object_storage_remote_initiator", false, false, "New setting."}, - // {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, + {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, // {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, - // {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, - // {"allow_retries_in_cluster_requests", false, false, "New setting"}, + {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, + {"allow_retries_in_cluster_requests", false, false, "New setting"}, // {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, // {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, // {"export_merge_tree_partition_force_export", false, false, "New setting."}, @@ -329,30 +329,6 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, // {"object_storage_cluster", "", "", "Antalya: New setting"}, // {"object_storage_max_nodes", 0, 0, "Antalya: New setting"}, - {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, - {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, - {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, - {"allow_retries_in_cluster_requests", false, false, "New setting"}, - // {"object_storage_remote_initiator", false, false, "New setting."}, - {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, - {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, - {"export_merge_tree_partition_force_export", false, false, "New setting."}, - {"export_merge_tree_partition_max_retries", 3, 3, "New setting."}, - {"export_merge_tree_partition_manifest_ttl", 180, 180, "New setting."}, - {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, - // 
{"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, - {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. Default enabled."}, - {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, - {"enable_alias_marker", true, true, "New setting."}, - // {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"}, - // {"input_format_parquet_verify_checksums", true, true, "New setting."}, - // {"output_format_parquet_write_checksums", false, true, "New setting."}, - {"export_merge_tree_part_max_bytes_per_file", 0, 0, "New setting."}, - {"export_merge_tree_part_max_rows_per_file", 0, 0, "New setting."}, - // {"cluster_table_function_split_granularity", "file", "file", "New setting."}, - // {"cluster_table_function_buckets_batch_size", 0, 0, "New setting."}, - {"export_merge_tree_part_throw_on_pending_mutations", true, true, "New setting."}, - {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.8", { From 3be71969a0764c4ce8b44989cd4ec95fd93b18af Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Fri, 24 Apr 2026 16:36:12 +0200 Subject: [PATCH 3/6] Use value_bounds instead of ColumnInfo::hyperrectangle for min/max Removes the `hyperrectangle` field from `DB::Iceberg::ColumnInfo` that was re-added during the frontport. The field was removed upstream in PR https://github.com/ClickHouse/ClickHouse/pull/98231, which relocated raw min/max bounds to `ParsedManifestFileEntry::value_bounds`. The `DataFileMetaInfo` Iceberg constructor now deserializes those bounds via the shared `deserializeFieldFromBinaryRepr` helper (moved from `ManifestFileIterator.cpp` to `IcebergFieldParseHelpers`). Addresses @ianton-ru's comment at https://github.com/Altinity/ClickHouse/pull/1687#discussion_r3137852430. 
--- .../DataLakes/IDataLakeMetadata.cpp | 43 +++++++++- .../DataLakes/IDataLakeMetadata.h | 3 +- .../Iceberg/IcebergFieldParseHelpers.cpp | 73 +++++++++++++++++ .../Iceberg/IcebergFieldParseHelpers.h | 7 ++ .../DataLakes/Iceberg/IcebergIterator.cpp | 3 +- .../DataLakes/Iceberg/ManifestFile.h | 2 - .../Iceberg/ManifestFileIterator.cpp | 81 +------------------ 7 files changed, 124 insertions(+), 88 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index c0fb681f9595..2dae8857d04c 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -100,31 +102,64 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( DataFileMetaInfo::DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, Int32 schema_id, - const std::unordered_map & columns_info_) + const std::unordered_map & columns_info_, + const std::unordered_map> & value_bounds_) { - +#if USE_AVRO std::vector column_ids; for (const auto & column : columns_info_) column_ids.push_back(column.first); auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); std::unordered_map name_by_index; + std::unordered_map type_by_index; for (const auto & name_and_type : name_and_types) { const auto name = name_and_type.getNameInStorage(); auto index = schema_processor.tryGetColumnIDByName(schema_id, name); if (index.has_value()) + { name_by_index[index.value()] = name; + type_by_index[index.value()] = name_and_type.type; + } } for (const auto & column : columns_info_) { auto i_name = name_by_index.find(column.first); - if (i_name != name_by_index.end()) + if (i_name == name_by_index.end()) + continue; + + std::optional hyperrectangle; + + auto i_bounds = value_bounds_.find(column.first); + auto i_type = type_by_index.find(column.first); + if (i_bounds != value_bounds_.end() && i_type != type_by_index.end()) { - columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, column.second.hyperrectangle}; + const auto & type = i_type->second; + if (const auto type_id = type->getTypeId(); + type_id != TypeIndex::Tuple && type_id != TypeIndex::Map && type_id != TypeIndex::Array) + { + String left_str; + String right_str; + if (i_bounds->second.first.tryGet(left_str) && i_bounds->second.second.tryGet(right_str)) + { + auto left = Iceberg::deserializeFieldFromBinaryRepr(left_str, type, true); + auto right = Iceberg::deserializeFieldFromBinaryRepr(right_str, type, false); + if (left && right) + hyperrectangle = DB::Range(*left, true, *right, true); + } + } } + + columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, hyperrectangle}; } +#else + (void)schema_processor; + (void)schema_id; + (void)columns_info_; + (void)value_bounds_; +#endif } DataFileMetaInfo::DataFileMetaInfo(Poco::JSON::Object::Ptr file_info) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index 945618a97ad1..a7fb48a8a2b5 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -63,7 +63,8 @@ class DataFileMetaInfo explicit DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, Int32 schema_id, - const std::unordered_map & columns_info_); + 
const std::unordered_map & columns_info_, + const std::unordered_map> & value_bounds_); void serialize(WriteBuffer & out) const; static DataFileMetaInfo deserialize(ReadBuffer & in); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp index 0874b663303a..1e84302758c4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp @@ -4,7 +4,10 @@ #include #include +#include #include +#include +#include #include #include @@ -98,6 +101,76 @@ std::vector fieldToInt64Array(const Field & value, std::string_view conte return result; } +std::optional deserializeFieldFromBinaryRepr(const std::string & str, DataTypePtr expected_type, bool lower_bound) +{ + auto non_nullable_type = removeNullable(expected_type); + auto column = non_nullable_type->createColumn(); + if (WhichDataType(non_nullable_type).isDecimal()) + { + /// Iceberg store decimal values as unscaled value with two's-complement big-endian binary + /// using the minimum number of bytes for the value + /// Our decimal binary representation is little endian + /// so we cannot reuse our default code for parsing it. + int64_t unscaled_value = 0; + + // Convert from big-endian to signed int + for (const auto byte : str) + unscaled_value = (unscaled_value << 8) | static_cast(byte); + + /// Add sign + if (str[0] & 0x80) + { + int64_t sign_extension = -1; + sign_extension <<= (str.size() * 8); + unscaled_value |= sign_extension; + } + + /// NOTE: It's very weird, but Decimal values for lower bound and upper bound + /// are stored rounded, without fractional part. What is more strange + /// the integer part is rounded mathematically correctly according to fractional part. + /// Example: 17.22 -> 17, 8888.999 -> 8889, 1423.77 -> 1424. + /// I've checked two implementations: Spark and Amazon Athena and both of them + /// do this. + /// + /// The problem is -- we cannot use rounded values for lower bounds and upper bounds. + /// Example: upper_bound(x) = 17.22, but it's rounded 17.00, now condition WHERE x >= 17.21 will + /// check rounded value and say: "Oh largest value is 17, so values bigger than 17.21 cannot be in this file, + /// let's skip it". But it will produce incorrect result since actual value (17.22 >= 17.21) is stored in this file. + /// + /// To handle this issue we subtract 1 from the integral part for lower_bound and add 1 to integral + /// part of upper_bound. This produces: 17.22 -> [16.0, 18.0]. So this is more rough boundary, + /// but at least it doesn't lead to incorrect results. + if (int32_t scale = getDecimalScale(*non_nullable_type)) + { + int64_t scaler = lower_bound ? 
-10 : 10; + while (--scale) + scaler *= 10; + + unscaled_value += scaler; + } + + if (const auto * decimal_type = checkDecimal(*non_nullable_type)) + { + DecimalField result(static_cast(unscaled_value), decimal_type->getScale()); + return result; + } + if (const auto * decimal_type = checkDecimal(*non_nullable_type)) + { + DecimalField result(unscaled_value, decimal_type->getScale()); + return result; + } + + return std::nullopt; + } + + /// For all other types except decimal binary representation + /// matches our internal representation + column->insertData(str.data(), str.length()); + Field result; + column->get(0, result); + return result; +} + } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h index 285062cb3d4f..21156647fd4b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h @@ -4,11 +4,14 @@ #if USE_AVRO +#include +#include #include #include #include #include +#include namespace DB::Iceberg { @@ -25,6 +28,10 @@ Int64 fieldToPeriodMs(const Field & value, std::string_view context, std::string /// Convert a Field containing an Array to vector, validating each element. std::vector fieldToInt64Array(const Field & value, std::string_view context, std::string_view arg_name); +/// Deserialize a single lower/upper bound value from Iceberg's binary representation. +/// See https://iceberg.apache.org/spec/#appendix-d-single-value-serialization +std::optional deserializeFieldFromBinaryRepr(const std::string & str, DataTypePtr expected_type, bool lower_bound); + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index bf22cc15ac6e..7de8317abad2 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -397,7 +397,8 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( *persistent_components.schema_processor, table_schema_id, /// current schema id to use current column names - manifest_file_entry->parsed_entry->columns_infos)); + manifest_file_entry->parsed_entry->columns_infos, + manifest_file_entry->parsed_entry->value_bounds)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); return object_info; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index c7453440512e..3a1bccc53411 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,7 +1,6 @@ #pragma once #include "config.h" -#include #include #include @@ -14,7 +13,6 @@ struct ColumnInfo std::optional rows_count; std::optional bytes_size; std::optional nulls_count; - std::optional hyperrectangle; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp index 9142b458d816..d68fa264e0f1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -45,86 +46,6 @@ namespace DB::Iceberg using 
namespace DB; -namespace -{ - /// Iceberg stores lower_bounds and upper_bounds serialized with some custom deserialization as bytes array - /// https://iceberg.apache.org/spec/#appendix-d-single-value-serialization - std::optional deserializeFieldFromBinaryRepr(std::string str, DB::DataTypePtr expected_type, bool lower_bound) - { - auto non_nullable_type = DB::removeNullable(expected_type); - auto column = non_nullable_type->createColumn(); - if (DB::WhichDataType(non_nullable_type).isDecimal()) - { - /// Iceberg store decimal values as unscaled value with two's-complement big-endian binary - /// using the minimum number of bytes for the value - /// Our decimal binary representation is little endian - /// so we cannot reuse our default code for parsing it. - int64_t unscaled_value = 0; - - // Convert from big-endian to signed int - for (const auto byte : str) - unscaled_value = (unscaled_value << 8) | static_cast(byte); - - /// Add sign - if (str[0] & 0x80) - { - int64_t sign_extension = -1; - sign_extension <<= (str.size() * 8); - unscaled_value |= sign_extension; - } - - /// NOTE: It's very weird, but Decimal values for lower bound and upper bound - /// are stored rounded, without fractional part. What is more strange - /// the integer part is rounded mathematically correctly according to fractional part. - /// Example: 17.22 -> 17, 8888.999 -> 8889, 1423.77 -> 1424. - /// I've checked two implementations: Spark and Amazon Athena and both of them - /// do this. - /// - /// The problem is -- we cannot use rounded values for lower bounds and upper bounds. - /// Example: upper_bound(x) = 17.22, but it's rounded 17.00, now condition WHERE x >= 17.21 will - /// check rounded value and say: "Oh largest value is 17, so values bigger than 17.21 cannot be in this file, - /// let's skip it". But it will produce incorrect result since actual value (17.22 >= 17.21) is stored in this file. - /// - /// To handle this issue we subtract 1 from the integral part for lower_bound and add 1 to integral - /// part of upper_bound. This produces: 17.22 -> [16.0, 18.0]. So this is more rough boundary, - /// but at least it doesn't lead to incorrect results. - if (int32_t scale = DB::getDecimalScale(*non_nullable_type)) - { - int64_t scaler = lower_bound ? 
-10 : 10; - while (--scale) - scaler *= 10; - - unscaled_value += scaler; - } - - if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) - { - DB::DecimalField result(static_cast(unscaled_value), decimal_type->getScale()); - return result; - } - if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) - { - DB::DecimalField result(unscaled_value, decimal_type->getScale()); - return result; - } - else - { - return std::nullopt; - } - } - else - { - /// For all other types except decimal binary representation - /// matches our internal representation - column->insertData(str.data(), str.length()); - DB::Field result; - column->get(0, result); - return result; - } - } - -} - const std::vector & ManifestFileIterator::ManifestFileEntriesHandle::getFilesWithoutDeleted(FileContentType content_type) const { From 741c2598e072c2e6f5e27aea88d3a941b614d0b6 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Mon, 30 Mar 2026 12:44:24 +0200 Subject: [PATCH 4/6] Fix row policies silently ignored on Iceberg tables with PREWHERE enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Iceberg read optimization (`allow_experimental_iceberg_read_optimization`) identifies constant columns from Iceberg metadata and removes them from the read request. When all requested columns become constant, it sets `need_only_count = true`, which tells the Parquet reader to skip all initialization — including `preparePrewhere` — and just return the raw row count from file metadata. This completely bypasses `row_level_filter` (row policies) and `prewhere_info`, returning unfiltered row counts. The InterpreterSelectQuery relies on the storage to apply these filters when `supportsPrewhere` is true and does not add a fallback FilterStep to the query plan, so the filter is silently lost. The fix prevents `need_only_count` from being set when an active `row_level_filter` or `prewhere_info` exists in the format filter info. Fixes #1595 (cherry picked from commit f204850f4a481f9f9478cd5ff98d2769f6271439) --- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 1f3e1e7419ba..658c3cd6525f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -761,7 +761,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (requested_columns_copy.size() + constant_columns.size() != original_columns) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't remove constant columns for file {} correct, fallback to read. Founded constant columns: [{}]", object_info->getPath(), constant_columns); - if (requested_columns_copy.empty()) + if (requested_columns_copy.empty() + && (!format_filter_info || (!format_filter_info->row_level_filter && !format_filter_info->prewhere_info))) need_only_count = true; } } From 037825f31f59c54918b8be08ca3bedd3f8ab8292 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 31 Mar 2026 14:30:26 +0200 Subject: [PATCH 5/6] Fix Iceberg read optimization replacing prewhere columns with constant NULLs The Altinity-specific constant column optimization (`allow_experimental_iceberg_read_optimization`) scans `requested_columns` for nullable columns absent from the Iceberg file metadata and replaces them with constant NULLs. 
However, `requested_columns` can also contain columns produced by `prewhere_info` or `row_level_filter` expressions (e.g. `equals(boolean_col, false)`). These computed columns are not in the file metadata, and their result type is often `Nullable(UInt8)`, so the optimization incorrectly treats them as missing file columns and replaces them with NULLs. This corrupts the prewhere pipeline: the Parquet reader evaluates the filter expression correctly, but the constant column optimization then overwrites the result with NULLs. With `need_filter = false` (old planner, PREWHERE + WHERE), all rows appear to fail the filter, producing empty output. With `need_filter = true`, the filter column is NULL so all rows are filtered out. The fix skips columns that match the `prewhere_info` or `row_level_filter` column names, since these are computed at read time and never stored in the file. (cherry picked from commit b7696a30dbf8064963b5e75ec45f5882e55bb7de) --- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 658c3cd6525f..895c964422f4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -739,6 +739,13 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (!column.second.second.type->isNullable()) continue; + /// Skip columns produced by prewhere or row-level filter expressions — + /// they are computed at read time, not stored in the file. + if (format_filter_info + && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) + || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) + continue; + /// Column is nullable and absent in file constant_columns_with_values[column.second.first] = ConstColumnWithValue{ From 5959fbdd1ec7b13674b6ff8c87447d2367508150 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 27 Apr 2026 11:43:09 +0200 Subject: [PATCH 6/6] Decode Iceberg value_bounds with the file's schema, not the table's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DataFileMetaInfo::DataFileMetaInfo` (Iceberg constructor introduced in 3be71969a07) deserialized `value_bounds` using the table's current schema. After schema evolution (e.g. `int` -> `long`) the bytes were still encoded with the file's old type — a 4-byte int — but were read as 8 bytes for `Int64`. `ColumnVector::insertData` ignores the length argument and always reads `sizeof(T)` bytes via `unalignedLoad`, so the extra 4 bytes came from adjacent memory and produced a garbage hyperrectangle. The garbage range often satisfied `Range::isPoint`, which made the iceberg read optimization replace the column with a constant value taken from the garbage bound, corrupting query results. Pass the file's `resolved_schema_id` separately so types are looked up against the schema the data file was written with, while column names keep coming from the current table schema (so the resulting `columns_info` map is keyed by names callers know about). Reproducer: `test_storage_iceberg_schema_evolution/test_evolved_schema_simple.py::test_evolved_schema_simple` — all 12 parametrizations failed at the assertion after `ALTER COLUMN a TYPE BIGINT`. 
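
As an aside, a minimal standalone C++ sketch of the failure mode (illustrative only — it models the width mismatch, not the actual reader code path): a bound serialized with the file's old 4-byte int type is decoded once with the correct width and once with the evolved 8-byte width, and the wider read pulls in whatever bytes happen to sit next to the value:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        // A bound written by the old schema as a 4-byte little-endian int (value 42),
        // followed by unrelated bytes that merely happen to be adjacent in memory.
        std::vector<unsigned char> buffer = {42, 0, 0, 0, 0xDE, 0xAD, 0xBE, 0xEF};

        int32_t with_file_type = 0;   // decoded with the file schema's type: reads 4 bytes
        std::memcpy(&with_file_type, buffer.data(), sizeof(with_file_type));

        int64_t with_table_type = 0;  // decoded with the evolved table type: reads 8 bytes
        std::memcpy(&with_table_type, buffer.data(), sizeof(with_table_type));

        std::cout << with_file_type << '\n';   // 42
        std::cout << with_table_type << '\n';  // garbage value on little-endian hosts
    }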
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../DataLakes/IDataLakeMetadata.cpp | 27 +++++++++++++------ .../DataLakes/IDataLakeMetadata.h | 12 +++++++-- .../DataLakes/Iceberg/IcebergIterator.cpp | 1 + 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 2dae8857d04c..e553907b4a04 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -101,7 +101,8 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( DataFileMetaInfo::DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, - Int32 schema_id, + Int32 table_schema_id, + Int32 file_schema_id, const std::unordered_map & columns_info_, const std::unordered_map> & value_bounds_) { @@ -110,18 +111,27 @@ DataFileMetaInfo::DataFileMetaInfo( for (const auto & column : columns_info_) column_ids.push_back(column.first); - auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); + /// Names are resolved via the table schema so that the resulting `columns_info` + /// map is keyed by the current column names that callers know about. + auto table_name_and_types = schema_processor.tryGetFieldsCharacteristics(table_schema_id, column_ids); std::unordered_map name_by_index; - std::unordered_map type_by_index; - for (const auto & name_and_type : name_and_types) + for (const auto & name_and_type : table_name_and_types) { const auto name = name_and_type.getNameInStorage(); - auto index = schema_processor.tryGetColumnIDByName(schema_id, name); + auto index = schema_processor.tryGetColumnIDByName(table_schema_id, name); if (index.has_value()) - { name_by_index[index.value()] = name; + } + + /// Types come from the file's schema because `value_bounds_` are encoded with + /// that schema's column types — see Iceberg single-value serialization spec. + std::unordered_map type_by_index; + auto file_name_and_types = schema_processor.tryGetFieldsCharacteristics(file_schema_id, column_ids); + for (const auto & name_and_type : file_name_and_types) + { + auto index = schema_processor.tryGetColumnIDByName(file_schema_id, name_and_type.getNameInStorage()); + if (index.has_value()) type_by_index[index.value()] = name_and_type.type; - } } for (const auto & column : columns_info_) @@ -156,7 +166,8 @@ DataFileMetaInfo::DataFileMetaInfo( } #else (void)schema_processor; - (void)schema_id; + (void)table_schema_id; + (void)file_schema_id; (void)columns_info_; (void)value_bounds_; #endif diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index a7fb48a8a2b5..1a43b9f20a85 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -59,10 +59,18 @@ class DataFileMetaInfo std::optional hyperrectangle; }; - // Extract metadata from Iceberg structure + // Extract metadata from Iceberg structure. + // table_schema_id is the current table schema, used to resolve column names that + // appear as keys in the resulting `columns_info` map. + // file_schema_id is the schema the data file (and its `value_bounds_`) was written + // with — bounds bytes are encoded with that schema's column types, so they must be + // deserialized using those types. After schema evolution (e.g. 
`int` -> `long`) + // the two ids differ, and using the table schema's type would misinterpret the + // bytes and produce a garbage hyperrectangle. explicit DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, - Int32 schema_id, + Int32 table_schema_id, + Int32 file_schema_id, const std::unordered_map & columns_info_, const std::unordered_map> & value_bounds_); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 7de8317abad2..97de3771264e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -397,6 +397,7 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( *persistent_components.schema_processor, table_schema_id, /// current schema id to use current column names + manifest_file_entry->resolved_schema_id, /// file's schema id to interpret value_bounds bytes manifest_file_entry->parsed_entry->columns_infos, manifest_file_entry->parsed_entry->value_bounds)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos);