From 1063baac3bf9f3b0b28c2ed4543adf8281031111 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:16:52 +0300 Subject: [PATCH 1/6] Merge pull request #1414 from Altinity/frontport/antalya-26.1/rendezvous_hashing 26.1 Antalya port - improvements for cluster requests --- docs/en/sql-reference/statements/system.md | 6 + programs/server/Server.cpp | 4 + src/Access/Common/AccessType.h | 1 + src/Client/MultiplexedConnections.cpp | 7 +- src/Common/CurrentMetrics.cpp | 1 + src/Common/ProfileEvents.cpp | 5 + src/Core/Protocol.h | 2 + src/Core/Range.cpp | 41 +++ src/Core/Range.h | 5 + src/Core/Settings.cpp | 19 +- src/Core/SettingsChangesHistory.cpp | 24 ++ .../ObjectStorages/IObjectStorage.cpp | 56 ++++ .../ObjectStorages/IObjectStorage.h | 66 ++++- src/Interpreters/ClusterDiscovery.cpp | 62 ++++- src/Interpreters/ClusterDiscovery.h | 13 + src/Interpreters/ClusterFunctionReadTask.cpp | 14 +- src/Interpreters/ClusterFunctionReadTask.h | 2 + src/Interpreters/Context.cpp | 45 ++- src/Interpreters/Context.h | 11 + src/Interpreters/InterpreterSystemQuery.cpp | 20 ++ src/Parsers/ASTSystemQuery.cpp | 2 + src/Parsers/ASTSystemQuery.h | 2 + src/Processors/Chunk.cpp | 7 +- src/Processors/Sources/ConstChunkGenerator.h | 9 +- src/QueryPipeline/RemoteQueryExecutor.cpp | 22 ++ src/QueryPipeline/RemoteQueryExecutor.h | 21 +- .../RemoteQueryExecutorReadContext.cpp | 44 ++- .../RemoteQueryExecutorReadContext.h | 5 +- .../DataLakes/IDataLakeMetadata.cpp | 191 +++++++++++++ .../DataLakes/IDataLakeMetadata.h | 63 ++++- .../DataLakes/Iceberg/IcebergIterator.cpp | 5 + .../DataLakes/Iceberg/IcebergIterator.h | 3 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 4 +- .../DataLakes/Iceberg/ManifestFile.h | 24 +- .../StorageObjectStorageCluster.cpp | 68 ++++- .../StorageObjectStorageSource.cpp | 207 +++++++++++++- .../StorageObjectStorageSource.h | 14 +- ...rageObjectStorageStableTaskDistributor.cpp | 166 +++++++++-- ...torageObjectStorageStableTaskDistributor.h | 22 +- .../tests/gtest_rendezvous_hashing.cpp | 18 +- src/Storages/StorageFileCluster.cpp | 45 ++- src/Storages/StorageURLCluster.cpp | 40 ++- .../test_s3_cache_locality/__init__.py | 0 .../configs/cluster.xml | 126 +++++++++ .../configs/named_collections.xml | 10 + .../test_s3_cache_locality/configs/users.xml | 9 + .../test_s3_cache_locality/test.py | 262 ++++++++++++++++++ .../test_s3_cluster/data/graceful/part0.csv | 1 + .../test_s3_cluster/data/graceful/part1.csv | 1 + .../test_s3_cluster/data/graceful/part2.csv | 1 + .../test_s3_cluster/data/graceful/part3.csv | 1 + .../test_s3_cluster/data/graceful/part4.csv | 1 + .../test_s3_cluster/data/graceful/part5.csv | 1 + .../test_s3_cluster/data/graceful/part6.csv | 1 + .../test_s3_cluster/data/graceful/part7.csv | 1 + .../test_s3_cluster/data/graceful/part8.csv | 1 + .../test_s3_cluster/data/graceful/part9.csv | 1 + .../test_s3_cluster/data/graceful/partA.csv | 1 + .../test_s3_cluster/data/graceful/partB.csv | 1 + .../test_s3_cluster/data/graceful/partC.csv | 1 + .../test_s3_cluster/data/graceful/partD.csv | 1 + .../test_s3_cluster/data/graceful/partE.csv | 1 + .../test_s3_cluster/data/graceful/partF.csv | 1 + tests/integration/test_s3_cluster/test.py | 81 ++++++ ...test_read_constant_columns_optimization.py | 215 ++++++++++++++ ...ck_object_storage_task_distribution_ms.xml | 7 + tests/integration/test_storage_s3/test.py | 2 + .../01271_show_privileges.reference | 1 + ..._settings_cannot_be_enabled_by_default.sql | 6 +- 69 files changed, 2016 insertions(+), 
105 deletions(-) create mode 100644 tests/integration/test_s3_cache_locality/__init__.py create mode 100644 tests/integration/test_s3_cache_locality/configs/cluster.xml create mode 100644 tests/integration/test_s3_cache_locality/configs/named_collections.xml create mode 100644 tests/integration/test_s3_cache_locality/configs/users.xml create mode 100644 tests/integration/test_s3_cache_locality/test.py create mode 100644 tests/integration/test_s3_cluster/data/graceful/part0.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part1.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part2.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part3.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part4.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part5.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part6.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part7.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part8.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/part9.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partA.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partB.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partC.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partD.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partE.csv create mode 100644 tests/integration/test_s3_cluster/data/graceful/partF.csv create mode 100644 tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py create mode 100644 tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 65521ba942f0..933bd4d32aca 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -224,6 +224,12 @@ Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$ Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) +## SYSTEM PRESHUTDOWN {#preshutdown} + + + +Prepares the node for graceful shutdown: unregisters it from autodiscovered clusters and stops accepting distributed requests to object storages (s3Cluster, icebergCluster, etc.). + ## SYSTEM INSTRUMENT {#instrument} Manages instrumentation points using LLVM's XRay feature which is available when ClickHouse is built using `ENABLE_XRAY=1`. diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 4e4ba58226cd..7b45a62f4834 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2607,6 +2607,8 @@ try } + global_context->startSwarmMode(); + { std::lock_guard lock(servers_lock); /// We should start interserver communications before (and more important shutdown after) tables.
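As a rough illustration of the mechanism this patch introduces (the class and member names below are hypothetical, not part of the patch), the swarm-mode flag behaves like an idempotent atomic toggle: Context::startSwarmMode() and stopSwarmMode(), added further down in this diff, return true only when the state actually changes, which is what lets SYSTEM START/STOP SWARM MODE register or unregister the node in autodiscovered clusters exactly once, no matter how many times the statement is repeated.

#include <atomic>

/// Hypothetical sketch of the idempotent toggle pattern used by the patch.
class SwarmModeFlag
{
public:
    /// Returns true only for the call that actually flips the flag off.
    bool stop()
    {
        bool expected = true;
        return enabled.compare_exchange_strong(expected, false);
    }

    /// Returns true only for the call that actually flips the flag back on.
    bool start()
    {
        bool expected = false;
        return enabled.compare_exchange_strong(expected, true);
    }

    bool isEnabled() const { return enabled.load(); }

private:
    std::atomic<bool> enabled{true};  /// Enabled by default, as in the patch.
};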
@@ -3094,6 +3096,8 @@ try is_cancelled = true; + global_context->stopSwarmMode(); + LOG_DEBUG(log, "Waiting for current connections to close."); size_t current_connections = 0; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 3ae01fb44fce..4ffbfebfbd47 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -352,6 +352,7 @@ enum class AccessType : uint8_t M(SYSTEM_TTL_MERGES, "SYSTEM STOP TTL MERGES, SYSTEM START TTL MERGES, STOP TTL MERGES, START TTL MERGES", TABLE, SYSTEM) \ M(SYSTEM_FETCHES, "SYSTEM STOP FETCHES, SYSTEM START FETCHES, STOP FETCHES, START FETCHES", TABLE, SYSTEM) \ M(SYSTEM_MOVES, "SYSTEM STOP MOVES, SYSTEM START MOVES, STOP MOVES, START MOVES", TABLE, SYSTEM) \ + M(SYSTEM_SWARM, "SYSTEM STOP SWARM MODE, SYSTEM START SWARM MODE, STOP SWARM MODE, START SWARM MODE", GLOBAL, SYSTEM) \ M(SYSTEM_PULLING_REPLICATION_LOG, "SYSTEM STOP PULLING REPLICATION LOG, SYSTEM START PULLING REPLICATION LOG", TABLE, SYSTEM) \ M(SYSTEM_CLEANUP, "SYSTEM STOP CLEANUP, SYSTEM START CLEANUP", TABLE, SYSTEM) \ M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, CANCEL VIEW", VIEW, SYSTEM) \ diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index c4c5010df342..3b1a09631dd9 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -232,7 +232,7 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector & uuid void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFunctionReadTaskResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendClusterFunctionReadTaskResponse(response); } @@ -241,7 +241,7 @@ void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFu void MultiplexedConnections::sendMergeTreeReadTaskResponse(const ParallelReadResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendMergeTreeReadTaskResponse(response); } @@ -527,9 +527,12 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead void MultiplexedConnections::invalidateReplica(ReplicaState & state) { + Connection * old_connection = state.connection; state.connection = nullptr; state.pool_entry = IConnectionPool::Entry(); --active_connection_count; + if (current_connection == old_connection) + current_connection = nullptr; } void MultiplexedConnections::setAsyncCallback(AsyncCallback async_callback) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 84d3b3c5bd0e..f398128cfebf 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -483,6 +483,7 @@ M(StartupScriptsExecutionState, "State of startup scripts execution: 0 = not finished, 1 = success, 2 = failure.") \ \ M(IsServerShuttingDown, "Indicates if the server is shutting down: 0 = no, 1 = yes") \ + M(IsSwarmModeEnabled, "Indicates if the swarm mode enabled or not: 0 = disabled, 1 = enabled") \ \ M(StatelessWorkerThreads, "Number of threads in the stateless worker thread pool.") \ M(StatelessWorkerThreadsActive, "Number of threads in the stateless worker thread pool running a task.") \ diff --git 
a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 9872e039abfa..87368fd76272 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -1377,6 +1377,11 @@ The server successfully detected this situation and will download merged part fr M(RuntimeFilterRowsChecked, "Number of rows checked by JOIN Runtime Filters", ValueType::Number) \ M(RuntimeFilterRowsPassed, "Number of rows that passed (not filtered out by) JOIN Runtime Filters", ValueType::Number) \ M(RuntimeFilterRowsSkipped, "Number of rows in blocks that were skipped by JOIN Runtime Filters", ValueType::Number) \ + \ + M(ObjectStorageClusterSentToMatchedReplica, "Number of tasks in ObjectStorageCluster request sent to matched replica.", ValueType::Number) \ + M(ObjectStorageClusterSentToNonMatchedReplica, "Number of tasks in ObjectStorageCluster request sent to non-matched replica.", ValueType::Number) \ + M(ObjectStorageClusterProcessedTasks, "Number of processed tasks in ObjectStorageCluster request.", ValueType::Number) \ + M(ObjectStorageClusterWaitingMicroseconds, "Time of waiting for tasks in ObjectStorageCluster request.", ValueType::Microseconds) \ #ifdef APPLY_FOR_EXTERNAL_EVENTS diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index d8bc5be252e9..dcb83cd33a80 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -96,8 +96,10 @@ namespace Protocol MergeTreeReadTaskRequest = 16, /// Request from a MergeTree replica to a coordinator TimezoneUpdate = 17, /// Receive server's (session-wide) default timezone SSHChallenge = 18, /// Return challenge for SSH signature signing + MAX = SSHChallenge, + ConnectionLost = 255, /// Exception that occurred on the client side. }; /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10 diff --git a/src/Core/Range.cpp b/src/Core/Range.cpp index 139fb8db76c9..766c2d4719c8 100644 --- a/src/Core/Range.cpp +++ b/src/Core/Range.cpp @@ -2,13 +2,21 @@ #include #include #include +#include #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +}; + + FieldRef::FieldRef(ColumnsWithTypeAndName * columns_, size_t row_idx_, size_t column_idx_) : Field((*(*columns_)[column_idx_].column)[row_idx_]), columns(columns_), row_idx(row_idx_), column_idx(column_idx_) { @@ -151,6 +159,13 @@ bool Range::isInfinite() const return left.isNegativeInfinity() && right.isPositiveInfinity(); } +/// [x, x] +bool Range::isPoint() const +{ + return fullBounded() && left_included && right_included && equals(left, right) + && !left.isNegativeInfinity() && !left.isPositiveInfinity(); +} + bool Range::intersectsRange(const Range & r) const { /// r to the left of me. @@ -276,6 +291,32 @@ bool Range::nearByWith(const Range & r) const return false; } +String Range::serialize(bool base64) const +{ + WriteBufferFromOwnString str; + + str << left_included << right_included; + writeFieldBinary(left, str); + writeFieldBinary(right, str); + + if (base64) + return base64Encode(str.str()); + else + return str.str(); +} + +void Range::deserialize(const String & range, bool base64) +{ + if (range.empty()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Empty range dump"); + + ReadBufferFromOwnString str(base64 ? 
base64Decode(range) : range); + + str >> left_included >> right_included; + left = readFieldBinary(str); + right = readFieldBinary(str); +} + Range intersect(const Range & a, const Range & b) { Range res = Range::createWholeUniverse(); diff --git a/src/Core/Range.h b/src/Core/Range.h index 6072795db0a9..059b7470d1d7 100644 --- a/src/Core/Range.h +++ b/src/Core/Range.h @@ -94,6 +94,8 @@ struct Range bool isBlank() const; + bool isPoint() const; + bool intersectsRange(const Range & r) const; bool containsRange(const Range & r) const; @@ -114,6 +116,9 @@ struct Range bool nearByWith(const Range & r) const; String toString() const; + + String serialize(bool base64 = false) const; + void deserialize(const String & range, bool base64 = false); }; Range intersect(const Range & a, const Range & b); diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 0cbb8921adc8..f239be8a09c0 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7665,6 +7665,19 @@ Default number of tasks for parallel reading in distributed query. Tasks are spr DECLARE(Bool, distributed_plan_optimize_exchanges, true, R"( Removes unnecessary exchanges in distributed query plan. Disable it for debugging. )", 0) \ + DECLARE(UInt64, lock_object_storage_task_distribution_ms, 500, R"( +In distributed object storage queries, do not distribute tasks to non-prefetched nodes while the prefetched node is active. +Determines how long a free executor node (one that has finished processing all of its assigned tasks) should wait before "stealing" tasks from the queues of currently busy executor nodes. + +Possible values: + +- 0 - steal tasks immediately after freeing up. +- >0 - wait for the specified period of time before stealing tasks. + +Setting this `>0` helps with cache reuse and might improve overall query time: +the busy node might have warmed-up caches for this specific task, while the free node would need to fetch lots of data from S3, +which might take longer than just waiting for the busy node and would generate extra traffic. +)", EXPERIMENTAL) \ DECLARE(String, distributed_plan_force_exchange_kind, "", R"( Force specified kind of Exchange operators between distributed query stages. @@ -7712,6 +7725,9 @@ If the number of set bits in a runtime bloom filter exceeds this ratio the filte )", EXPERIMENTAL) \ DECLARE(Bool, rewrite_in_to_join, false, R"( Rewrite expressions like 'x IN subquery' to JOIN. This might be useful for optimizing the whole query with join reordering. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_iceberg_read_optimization, true, R"( +Allow Iceberg read optimization based on Iceberg metadata. )", EXPERIMENTAL) \ \ /** Experimental timeSeries* aggregate functions. */ \ @@ -7880,7 +7896,8 @@ Maximum number of WebAssembly UDF instances that can run in parallel per functio MAKE_OBSOLETE(M, Bool, describe_extend_object_types, false) \ MAKE_OBSOLETE(M, Bool, allow_experimental_object_type, false) \ MAKE_OBSOLETE(M, BoolAuto, insert_select_deduplicate, Field{"auto"}) \ - MAKE_OBSOLETE(M, Bool, use_text_index_dictionary_cache, false) + MAKE_OBSOLETE(M, Bool, use_text_index_dictionary_cache, false) \ + MAKE_OBSOLETE(M, Bool, allow_retries_in_cluster_requests, false) /** The section above is for obsolete settings. Do not add anything there.
*/ #endif /// __CLION_IDE__ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index ac6d42e957f6..776393441e03 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -329,6 +329,30 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, // {"object_storage_cluster", "", "", "Antalya: New setting"}, // {"object_storage_max_nodes", 0, 0, "Antalya: New setting"}, + {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, + {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, + {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, + {"allow_retries_in_cluster_requests", false, false, "New setting"}, + // {"object_storage_remote_initiator", false, false, "New setting."}, + {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, + {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, + {"export_merge_tree_partition_force_export", false, false, "New setting."}, + {"export_merge_tree_partition_max_retries", 3, 3, "New setting."}, + {"export_merge_tree_partition_manifest_ttl", 180, 180, "New setting."}, + {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, + // {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, + {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. Default enabled."}, + {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, + {"enable_alias_marker", true, true, "New setting."}, + // {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"}, + // {"input_format_parquet_verify_checksums", true, true, "New setting."}, + // {"output_format_parquet_write_checksums", false, true, "New setting."}, + {"export_merge_tree_part_max_bytes_per_file", 0, 0, "New setting."}, + {"export_merge_tree_part_max_rows_per_file", 0, 0, "New setting."}, + // {"cluster_table_function_split_granularity", "file", "file", "New setting."}, + // {"cluster_table_function_buckets_batch_size", 0, 0, "New setting."}, + {"export_merge_tree_part_throw_on_pending_mutations", true, true, "New setting."}, + {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.8", { diff --git a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp index 5b2225775247..6c825fd9c21b 100644 --- a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.cpp @@ -8,6 +8,11 @@ #include #include #include +#include + +#include +#include +#include namespace DB @@ -102,4 +107,55 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return write_settings; } +RelativePathWithMetadata::RelativePathWithMetadata(const DataFileInfo & info, std::optional metadata_) + : metadata(std::move(metadata_)) +{ + relative_path = info.file_path; + file_meta_info = info.file_meta_info; +} + +RelativePathWithMetadata::CommandInTaskResponse::CommandInTaskResponse(const std::string & task) +{ + Poco::JSON::Parser parser; + try + { + auto json = parser.parse(task).extract(); + if (!json) + return; + + is_valid = true; + + if 
(json->has("file_path")) + file_path = json->getValue("file_path"); + if (json->has("retry_after_us")) + retry_after_us = json->getValue("retry_after_us"); + if (json->has("meta_info")) + file_meta_info = std::make_shared(json->getObject("meta_info")); + } + catch (const Poco::JSON::JSONException &) + { /// Not a JSON + return; + } + catch (const Poco::SyntaxException &) + { /// Not a JSON + return; + } +} + +std::string RelativePathWithMetadata::CommandInTaskResponse::toString() const +{ + Poco::JSON::Object json; + if (file_path.has_value()) + json.set("file_path", file_path.value()); + if (retry_after_us.has_value()) + json.set("retry_after_us", retry_after_us.value()); + if (file_meta_info.has_value()) + json.set("meta_info", file_meta_info.value()->toJson()); + + std::ostringstream oss; + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); +} + } diff --git a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h index 6610ee6874f9..a6a7bc6696f6 100644 --- a/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h +++ b/src/Disks/DiskObjectStorage/ObjectStorages/IObjectStorage.h @@ -110,20 +110,74 @@ struct ObjectMetadata ObjectAttributes attributes; }; + +struct DataFileInfo; +class DataFileMetaInfo; +using DataFileMetaInfoPtr = std::shared_ptr; + struct DataLakeObjectMetadata; struct RelativePathWithMetadata { + class CommandInTaskResponse + { + public: + CommandInTaskResponse() = default; + explicit CommandInTaskResponse(const std::string & task); + + bool isValid() const { return is_valid; } + void setFilePath(const std::string & file_path_ ) + { + file_path = file_path_; + is_valid = true; + } + void setRetryAfterUs(Poco::Timestamp::TimeDiff time_us) + { + retry_after_us = time_us; + is_valid = true; + } + void setFileMetaInfo(DataFileMetaInfoPtr file_meta_info_ ) + { + file_meta_info = file_meta_info_; + is_valid = true; + } + + std::string toString() const; + + std::optional getFilePath() const { return file_path; } + std::optional getRetryAfterUs() const { return retry_after_us; } + std::optional getFileMetaInfo() const { return file_meta_info; } + + private: + bool is_valid = false; + std::optional file_path; + std::optional retry_after_us; + std::optional file_meta_info; + }; + String relative_path; /// Object metadata: size, modification time, etc. 
std::optional metadata; + /// Information about columns + std::optional file_meta_info; + /// Retry request after short pause + CommandInTaskResponse command; RelativePathWithMetadata() = default; - explicit RelativePathWithMetadata(String relative_path_, std::optional metadata_ = std::nullopt) - : relative_path(std::move(relative_path_)) + explicit RelativePathWithMetadata(String command_or_path, std::optional metadata_ = std::nullopt) + : relative_path(std::move(command_or_path)) , metadata(std::move(metadata_)) - {} + , command(relative_path) + { + if (command.isValid()) + { + relative_path = command.getFilePath().value_or(""); + file_meta_info = command.getFileMetaInfo(); + } + } + + explicit RelativePathWithMetadata(const DataFileInfo & info, std::optional metadata_ = std::nullopt); RelativePathWithMetadata(const RelativePathWithMetadata & other) = default; @@ -131,6 +185,12 @@ struct RelativePathWithMetadata std::string getFileName() const { return std::filesystem::path(relative_path).filename(); } std::string getPath() const { return relative_path; } + + void setFileMetaInfo(std::optional file_meta_info_ ) { file_meta_info = file_meta_info_; } + std::optional getFileMetaInfo() const { return file_meta_info; } + + const CommandInTaskResponse & getCommand() const { return command; } + std::string getFileNameWithoutExtension() const { return std::filesystem::path(relative_path).stem(); } }; struct ObjectKeyWithMetadata diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 98f57c31e9e5..844a60a4fba2 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -152,6 +152,13 @@ class ClusterDiscovery::Flags cv.notify_one(); } + void wakeup() + { + std::unique_lock lk(mu); + any_need_update = true; + cv.notify_one(); + } + private: std::condition_variable cv; std::mutex mu; @@ -441,7 +448,9 @@ bool ClusterDiscovery::upsertCluster(ClusterInfo & cluster_info) return true; }; - if (!cluster_info.current_node_is_observer && !contains(node_uuids, current_node_name)) + if (!cluster_info.current_node_is_observer + && context->isSwarmModeEnabled() + && !contains(node_uuids, current_node_name)) { LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name); registerInZk(zk, cluster_info); @@ -506,12 +515,30 @@ void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & inf return; } + if (!context->isSwarmModeEnabled()) + { + LOG_DEBUG(log, "STOP SWARM MODE called, skip self-registering current node {} in cluster {}", current_node_name, info.name); + return; + } + LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name); zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral); LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name); } +void ClusterDiscovery::unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info) +{ + if (info.current_node_is_observer) + return; + + String node_path = getShardsListPath(info.zk_root) / current_node_name; + LOG_DEBUG(log, "Removing current node {} from cluster {}", current_node_name, info.name); + + zk->remove(node_path); + LOG_DEBUG(log, "Current node {} removed from cluster {}", current_node_name, info.name); +} + void ClusterDiscovery::initialUpdate() { LOG_DEBUG(log, "Initializing"); @@ -554,6 +581,18 @@ void ClusterDiscovery::initialUpdate() is_initialized = true; } +void ClusterDiscovery::registerAll() +{ + 
register_change_flag = RegisterChangeFlag::RCF_REGISTER_ALL; + clusters_to_update->wakeup(); +} + +void ClusterDiscovery::unregisterAll() +{ + register_change_flag = RegisterChangeFlag::RCF_UNREGISTER_ALL; + clusters_to_update->wakeup(); +} + void ClusterDiscovery::findDynamicClusters( std::unordered_map & info, std::unordered_set * unchanged_roots) @@ -782,6 +821,27 @@ bool ClusterDiscovery::runMainThread(std::function up_to_date_callback) { up_to_date_callback(); } + + RegisterChangeFlag flag = register_change_flag.exchange(RegisterChangeFlag::RCF_NONE); + + if (flag == RegisterChangeFlag::RCF_REGISTER_ALL) + { + LOG_DEBUG(log, "Register in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + registerInZk(zk, info); + } + } + else if (flag == RegisterChangeFlag::RCF_UNREGISTER_ALL) + { + LOG_DEBUG(log, "Unregister in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + unregisterFromZk(zk, info); + } + } } LOG_DEBUG(log, "Worker thread stopped"); return finished; diff --git a/src/Interpreters/ClusterDiscovery.h b/src/Interpreters/ClusterDiscovery.h index 963970b32af2..eee0c157e465 100644 --- a/src/Interpreters/ClusterDiscovery.h +++ b/src/Interpreters/ClusterDiscovery.h @@ -38,6 +38,9 @@ class ClusterDiscovery ~ClusterDiscovery(); + void registerAll(); + void unregisterAll(); + private: struct NodeInfo { @@ -112,6 +115,7 @@ class ClusterDiscovery void initialUpdate(); void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); + void unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); Strings getNodeNames(zkutil::ZooKeeperPtr & zk, const String & zk_root, @@ -194,6 +198,15 @@ class ClusterDiscovery std::vector multicluster_discovery_paths; MultiVersion::Version macros; + + enum RegisterChangeFlag + { + RCF_NONE, + RCF_REGISTER_ALL, + RCF_UNREGISTER_ALL, + }; + + std::atomic register_change_flag = RegisterChangeFlag::RCF_NONE; }; } diff --git a/src/Interpreters/ClusterFunctionReadTask.cpp b/src/Interpreters/ClusterFunctionReadTask.cpp index 6eda3481a78f..fdacbda9270b 100644 --- a/src/Interpreters/ClusterFunctionReadTask.cpp +++ b/src/Interpreters/ClusterFunctionReadTask.cpp @@ -41,8 +41,16 @@ ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(ObjectInfoPtr o } #endif - const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; - path = send_over_whole_archive ? object->getPathOrPathToArchiveIfArchive() : object->getPath(); + file_meta_info = object->relative_path_with_metadata.file_meta_info; + + if (object->relative_path_with_metadata.getCommand().isValid()) + path = object->relative_path_with_metadata.getCommand().toString(); + else + { + const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; + path = send_over_whole_archive ? 
object->getPathOrPathToArchiveIfArchive() : object->getPath(); + } + file_bucket_info = object->file_bucket_info; } @@ -74,6 +82,8 @@ ObjectInfoPtr ClusterFunctionReadTaskResponse::getObjectInfo() const } object->data_lake_metadata = data_lake_metadata; object->file_bucket_info = file_bucket_info; + if (file_meta_info.has_value()) + object->relative_path_with_metadata.file_meta_info = file_meta_info; return object; } diff --git a/src/Interpreters/ClusterFunctionReadTask.h b/src/Interpreters/ClusterFunctionReadTask.h index d94133d252b5..c41fa8293e75 100644 --- a/src/Interpreters/ClusterFunctionReadTask.h +++ b/src/Interpreters/ClusterFunctionReadTask.h @@ -25,6 +25,8 @@ struct ClusterFunctionReadTaskResponse DataLakeObjectMetadata data_lake_metadata; /// Iceberg object metadata std::optional iceberg_info; + /// File's columns info + std::optional file_meta_info; /// Convert received response into ObjectInfo. ObjectInfoPtr getObjectInfo() const; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 92ae95d7e9c9..2304133f54f8 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -262,6 +262,7 @@ namespace CurrentMetrics extern const Metric IndexUncompressedCacheCells; extern const Metric ZooKeeperSessionExpired; extern const Metric ZooKeeperConnectionLossStartedTimestampSeconds; + extern const Metric IsSwarmModeEnabled; } @@ -696,6 +697,7 @@ struct ContextSharedPart : boost::noncopyable std::map server_ports; std::atomic shutdown_called = false; + std::atomic swarm_mode_enabled = true; Stopwatch uptime_watch TSA_GUARDED_BY(mutex); @@ -867,6 +869,8 @@ struct ContextSharedPart : boost::noncopyable */ void shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { + swarm_mode_enabled = false; + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); bool is_shutdown_called = shutdown_called.exchange(true); if (is_shutdown_called) return; @@ -5565,7 +5569,6 @@ std::shared_ptr Context::getCluster(const std::string & cluster_name) c throw Exception(ErrorCodes::CLUSTER_DOESNT_EXIST, "Requested cluster '{}' not found", cluster_name); } - std::shared_ptr Context::tryGetCluster(const std::string & cluster_name) const { std::shared_ptr res = nullptr; @@ -5584,6 +5587,21 @@ std::shared_ptr Context::tryGetCluster(const std::string & cluster_name return res; } +void Context::unregisterInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->unregisterAll(); +} + +void Context::registerInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->registerAll(); +} void Context::reloadClusterConfig() const { @@ -6563,12 +6581,35 @@ void Context::stopServers(const ServerType & server_type) const shared->stop_servers_callback(server_type); } - void Context::shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { shared->shutdown(); } +bool Context::stopSwarmMode() +{ + bool expected_is_enabled = true; + bool is_stopped_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, false); + if (is_stopped_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); + // return true if stop successful + return is_stopped_now; +} + +bool Context::startSwarmMode() +{ + bool expected_is_enabled = false; + bool is_started_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, true); + if (is_started_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 1); + 
// return true if start successful + return is_started_now; +} + +bool Context::isSwarmModeEnabled() const +{ + return shared->swarm_mode_enabled; +} Context::ApplicationType Context::getApplicationType() const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index b10d3a3f2ce4..74730f22107e 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -1476,6 +1476,8 @@ class Context: public ContextData, public std::enable_shared_from_this size_t getClustersVersion() const; void startClusterDiscovery(); + void registerInAutodiscoveryClusters(); + void unregisterInAutodiscoveryClusters(); /// Sets custom cluster, but doesn't update configuration void setCluster(const String & cluster_name, const std::shared_ptr & cluster); @@ -1595,6 +1597,15 @@ class Context: public ContextData, public std::enable_shared_from_this void shutdown(); + /// Stop some works to allow graceful shutdown later. + /// Returns true if stop successful. + bool stopSwarmMode(); + /// Resume some works if we change our mind. + /// Returns true if start successful. + bool startSwarmMode(); + /// Return current swarm mode state. + bool isSwarmModeEnabled() const; + bool isInternalQuery() const { return is_internal_query; } void setInternalQuery(bool internal) { is_internal_query = internal; } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 8602cf1c36be..c5058e1879f8 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -785,6 +785,20 @@ BlockIO InterpreterSystemQuery::execute() case Type::START_MOVES: startStopAction(ActionLocks::PartsMove, true); break; + case Type::STOP_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->stopSwarmMode()) + getContext()->unregisterInAutodiscoveryClusters(); + break; + } + case Type::START_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->startSwarmMode()) + getContext()->registerInAutodiscoveryClusters(); + break; + } case Type::STOP_FETCHES: startStopAction(ActionLocks::PartsFetch, false); break; @@ -2255,6 +2269,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() required_access.emplace_back(AccessType::SYSTEM_MOVES, query.getDatabase(), query.getTable()); break; } + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: + { + required_access.emplace_back(AccessType::SYSTEM_SWARM); + break; + } case Type::STOP_PULLING_REPLICATION_LOG: case Type::START_PULLING_REPLICATION_LOG: { diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 5fd99ca62aa5..49c6611b7b5e 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -627,6 +627,8 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::RECONNECT_ZOOKEEPER: case Type::FREE_MEMORY: case Type::RESET_DDL_WORKER: + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: break; case Type::UNKNOWN: case Type::END: diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 8a062cbf1646..3d4b087207d1 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -91,6 +91,8 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster START_FETCHES, STOP_MOVES, START_MOVES, + STOP_SWARM_MODE, + START_SWARM_MODE, STOP_REPLICATED_SENDS, START_REPLICATED_SENDS, STOP_REPLICATION_QUEUES, diff --git a/src/Processors/Chunk.cpp 
b/src/Processors/Chunk.cpp index 0564131c8768..17a095db7780 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -108,7 +108,12 @@ void Chunk::addColumn(ColumnPtr column) void Chunk::addColumn(size_t position, ColumnPtr column) { - if (position >= columns.size()) + if (position == columns.size()) + { + addColumn(column); + return; + } + if (position > columns.size()) throw Exception(ErrorCodes::POSITION_OUT_OF_BOUND, "Position {} out of bound in Chunk::addColumn(), max position = {}", position, !columns.empty() ? columns.size() - 1 : 0); diff --git a/src/Processors/Sources/ConstChunkGenerator.h b/src/Processors/Sources/ConstChunkGenerator.h index 8232bd5f0dff..ca120fc23d23 100644 --- a/src/Processors/Sources/ConstChunkGenerator.h +++ b/src/Processors/Sources/ConstChunkGenerator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -13,7 +14,7 @@ class ConstChunkGenerator : public ISource public: ConstChunkGenerator(SharedHeader header, size_t total_num_rows, size_t max_block_size_) : ISource(std::move(header)) - , remaining_rows(total_num_rows), max_block_size(max_block_size_) + , generated_rows(0), remaining_rows(total_num_rows), max_block_size(max_block_size_) { } @@ -27,10 +28,14 @@ class ConstChunkGenerator : public ISource size_t num_rows = std::min(max_block_size, remaining_rows); remaining_rows -= num_rows; - return cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + auto chunk = cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + chunk.getChunkInfos().add(std::make_shared(generated_rows)); + generated_rows += num_rows; + return chunk; } private: + size_t generated_rows; size_t remaining_rows; size_t max_block_size; }; diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 3ff9ef0a9827..7c4bf61113e9 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -662,7 +662,11 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet /// We can actually return it, and the first call to RemoteQueryExecutor::read /// will return earlier. We should consider doing it. if (!packet.block.empty() && (packet.block.rows() > 0)) + { + if (extension && extension->replica_info) + replica_has_processed_data.insert(extension->replica_info->number_of_current_replica); return ReadResult(adaptBlockStructure(packet.block, *header)); + } break; /// If the block is empty - we will receive other packets before EndOfStream. 
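As a rough illustration of the recovery rule that the ConnectionLost handling below relies on (the function here is a hypothetical sketch, not part of the patch): a replica that disappears can only be forgiven when the task iterator supports re-running tasks and that replica has not returned any data rows yet, because its unprocessed tasks can then be rescheduled onto other replicas; if it already produced data, rescheduling would duplicate rows, so the original exception must propagate instead.

#include <cstddef>
#include <unordered_set>

/// Decide whether a lost replica's tasks may be rescheduled instead of failing the query.
bool canRecoverFromLostReplica(
    bool task_iterator_supports_rerun,
    size_t replica_number,
    const std::unordered_set<size_t> & replicas_that_returned_data)
{
    return task_iterator_supports_rerun
        && !replicas_that_returned_data.contains(replica_number);
}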
case Protocol::Server::Exception: @@ -724,6 +728,19 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet case Protocol::Server::TimezoneUpdate: break; + case Protocol::Server::ConnectionLost: + if (extension && extension->task_iterator && extension->task_iterator->supportRerunTask() && extension->replica_info) + { + if (!replica_has_processed_data.contains(extension->replica_info->number_of_current_replica)) + { + finished = true; + extension->task_iterator->rescheduleTasksFromReplica(extension->replica_info->number_of_current_replica); + return ReadResult(Block{}); + } + } + packet.exception->rethrow(); + break; + default: got_unknown_packet_from_replica = true; throw Exception( @@ -1005,6 +1022,11 @@ void RemoteQueryExecutor::setProfileInfoCallback(ProfileInfoCallback callback) profile_info_callback = std::move(callback); } +bool RemoteQueryExecutor::skipUnavailableShards() const +{ + return context->getSettingsRef()[Setting::skip_unavailable_shards]; +} + bool RemoteQueryExecutor::needToSkipUnavailableShard() { if (context->getSettingsRef()[Setting::skip_unavailable_shards] && (0 == connections->size())) diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index e57aad64813b..8c5b1144cf82 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -34,7 +34,22 @@ class RemoteQueryExecutorReadContext; class ParallelReplicasReadingCoordinator; -using TaskIterator = std::function; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +}; + +class TaskIterator +{ +public: + virtual ~TaskIterator() = default; + virtual bool supportRerunTask() const { return false; } + virtual void rescheduleTasksFromReplica(size_t /* number_of_current_replica */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method rescheduleTasksFromReplica is not implemented"); + } + virtual ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const = 0; +}; /// This class allows one to launch queries on remote replicas of one shard and get results class RemoteQueryExecutor @@ -223,6 +238,8 @@ class RemoteQueryExecutor IConnections & getConnections() { return *connections; } + bool skipUnavailableShards() const; + bool needToSkipUnavailableShard(); bool isReplicaUnavailable() const { return extension && extension->parallel_reading_coordinator && connections->size() == 0; } @@ -329,6 +346,8 @@ class RemoteQueryExecutor const bool read_packet_type_separately = false; + std::unordered_set replica_has_processed_data; + /// Send all scalars to remote servers void sendScalars(); diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp index f0598ac89345..55340cfc59aa 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp @@ -17,10 +17,13 @@ namespace ErrorCodes extern const int CANNOT_READ_FROM_SOCKET; extern const int CANNOT_OPEN_FILE; extern const int SOCKET_TIMEOUT; + extern const int ATTEMPT_TO_READ_AFTER_EOF; } RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_) + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_) : AsyncTaskExecutor(std::make_unique(*this)) , executor(executor_) , suspend_when_query_sent(suspend_when_query_sent_) @@ -55,19 +58,42 @@ void 
RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, Sus if (read_context.executor.needToSkipUnavailableShard()) return; - while (true) + try { - read_context.has_read_packet_part = PacketPart::None; + while (true) + { + read_context.has_read_packet_part = PacketPart::None; + + if (read_context.read_packet_type_separately) + { + read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Type; + suspend_callback(); + } + read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Body; + if (read_context.packet.type == Protocol::Server::Data && read_context.packet.block.rows() > 0) + read_context.has_data_packets = true; - if (read_context.read_packet_type_separately) + suspend_callback(); + } + } + catch (const Exception & e) + { + /// If a cluster node unexpectedly shuts down (kill/segfault/power off/etc.), the socket just closes. + /// If the initiator did not process any data packets before, this fact can be ignored. + /// Unprocessed tasks will be executed on other nodes. + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + && !read_context.has_data_packets.load() + && read_context.executor.skipUnavailableShards()) { - read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); - read_context.has_read_packet_part = PacketPart::Type; + read_context.packet.type = Protocol::Server::ConnectionLost; + read_context.packet.exception = std::make_unique(getCurrentExceptionMessageAndPattern(true), getCurrentExceptionCode()); + read_context.has_read_packet_part = PacketPart::Body; suspend_callback(); } - read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); - read_context.has_read_packet_part = PacketPart::Body; - suspend_callback(); + else + throw; } } diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.h b/src/QueryPipeline/RemoteQueryExecutorReadContext.h index abde6cb93ef3..16ff5dc67a69 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.h +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.h @@ -26,7 +26,9 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor { public: explicit RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_); + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_); ~RemoteQueryExecutorReadContext() override; @@ -85,6 +87,7 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor /// None -> Type -> Body -> None /// None -> Body -> None std::atomic has_read_packet_part = PacketPart::None; + std::atomic_bool has_data_packets = false; Packet packet; RemoteQueryExecutor & executor; diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 6eed86d81e7d..c0fb681f9595 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -1,9 +1,19 @@ #include #include +#include +#include +#include +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +}; + namespace { @@ -87,4 +97,185 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( return DB::prepareReadingFromFormat(requested_columns, storage_snapshot,
context, supports_subset_of_columns, supports_tuple_elements); } +DataFileMetaInfo::DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_) +{ + + std::vector column_ids; + for (const auto & column : columns_info_) + column_ids.push_back(column.first); + + auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); + std::unordered_map name_by_index; + for (const auto & name_and_type : name_and_types) + { + const auto name = name_and_type.getNameInStorage(); + auto index = schema_processor.tryGetColumnIDByName(schema_id, name); + if (index.has_value()) + name_by_index[index.value()] = name; + } + + for (const auto & column : columns_info_) + { + auto i_name = name_by_index.find(column.first); + if (i_name != name_by_index.end()) + { + columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, column.second.hyperrectangle}; + } + } +} + +DataFileMetaInfo::DataFileMetaInfo(Poco::JSON::Object::Ptr file_info) +{ + if (!file_info) + return; + + auto log = getLogger("DataFileMetaInfo"); + + if (file_info->has("columns")) + { + auto columns = file_info->getArray("columns"); + for (size_t i = 0; i < columns->size(); ++i) + { + auto column = columns->getObject(static_cast(i)); + + std::string name; + if (column->has("name")) + name = column->get("name").toString(); + else + { + LOG_WARNING(log, "Can't read column name, ignored"); + continue; + } + + DB::DataFileMetaInfo::ColumnInfo column_info; + if (column->has("rows")) + column_info.rows_count = column->get("rows"); + if (column->has("nulls")) + column_info.nulls_count = column->get("nulls"); + if (column->has("range")) + { + Range range(""); + std::string r = column->get("range"); + try + { + range.deserialize(r, /*base64*/ true); + column_info.hyperrectangle = std::move(range); + } + catch (const Exception & e) + { + LOG_WARNING(log, "Can't read range for column {}, range '{}' ignored, error: {}", name, r, e.what()); + } + } + + columns_info[name] = column_info; + } + } +} + +Poco::JSON::Object::Ptr DataFileMetaInfo::toJson() const +{ + Poco::JSON::Object::Ptr file_info = new Poco::JSON::Object(); + + if (!columns_info.empty()) + { + Poco::JSON::Array::Ptr columns = new Poco::JSON::Array(); + + for (const auto & column : columns_info) + { + Poco::JSON::Object::Ptr column_info = new Poco::JSON::Object(); + column_info->set("name", column.first); + if (column.second.rows_count.has_value()) + column_info->set("rows", column.second.rows_count.value()); + if (column.second.nulls_count.has_value()) + column_info->set("nulls", column.second.nulls_count.value()); + if (column.second.hyperrectangle.has_value()) + column_info->set("range", column.second.hyperrectangle.value().serialize(/*base64*/ true)); + + columns->add(column_info); + } + + file_info->set("columns", columns); + } + + return file_info; +} + +constexpr size_t FIELD_MASK_ROWS = 0x1; +constexpr size_t FIELD_MASK_NULLS = 0x2; +constexpr size_t FIELD_MASK_RECT = 0x4; +constexpr size_t FIELD_MASK_ALL = 0x7; + +void DataFileMetaInfo::serialize(WriteBuffer & out) const +{ + auto size = columns_info.size(); + writeIntBinary(size, out); + for (const auto & column : columns_info) + { + writeStringBinary(column.first, out); + size_t field_mask = 0; + if (column.second.rows_count.has_value()) + field_mask |= FIELD_MASK_ROWS; + if (column.second.nulls_count.has_value()) + field_mask |= FIELD_MASK_NULLS; + if (column.second.hyperrectangle.has_value()) + 
field_mask |= FIELD_MASK_RECT; + writeIntBinary(field_mask, out); + + if (column.second.rows_count.has_value()) + writeIntBinary(column.second.rows_count.value(), out); + if (column.second.nulls_count.has_value()) + writeIntBinary(column.second.nulls_count.value(), out); + if (column.second.hyperrectangle.has_value()) + { + writeFieldBinary(column.second.hyperrectangle.value().left, out); + writeFieldBinary(column.second.hyperrectangle.value().right, out); + } + } +} + +DataFileMetaInfo DataFileMetaInfo::deserialize(ReadBuffer & in) +{ + DataFileMetaInfo result; + + size_t size; + readIntBinary(size, in); + + for (size_t i = 0; i < size; ++i) + { + std::string name; + readStringBinary(name, in); + size_t field_mask; + readIntBinary(field_mask, in); + if ((field_mask & FIELD_MASK_ALL) != field_mask) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected field mask: {}", field_mask); + + ColumnInfo & column = result.columns_info[name]; + + if (field_mask & FIELD_MASK_ROWS) + { + Int64 value; + readIntBinary(value, in); + column.rows_count = value; + } + if (field_mask & FIELD_MASK_NULLS) + { + Int64 value; + readIntBinary(value, in); + column.nulls_count = value; + } + if (field_mask & FIELD_MASK_RECT) + { + FieldRef left = readFieldBinary(in); + FieldRef right = readFieldBinary(in); + column.hyperrectangle = Range(left, true, right, true); + } + } + + return result; +} + + } diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index c7421143a7cc..945618a97ad1 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include + namespace DataLake { @@ -30,7 +33,64 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; -} +}; + +namespace Iceberg +{ +struct ColumnInfo; +}; + +class DataFileMetaInfo +{ +public: + DataFileMetaInfo() = default; + + // Deserialize from json in distributed requests + explicit DataFileMetaInfo(const Poco::JSON::Object::Ptr file_info); + + // Serialize to json in distributed requests + Poco::JSON::Object::Ptr toJson() const; + + // subset of Iceberg::ColumnInfo now + struct ColumnInfo + { + std::optional rows_count; + std::optional nulls_count; + std::optional hyperrectangle; + }; + + // Extract metadata from Iceberg structure + explicit DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_); + + void serialize(WriteBuffer & out) const; + static DataFileMetaInfo deserialize(ReadBuffer & in); + + bool empty() const { return columns_info.empty(); } + + std::unordered_map columns_info; +}; + +using DataFileMetaInfoPtr = std::shared_ptr; + +struct DataFileInfo +{ + std::string file_path; + std::optional file_meta_info; + + explicit DataFileInfo(const std::string & file_path_) + : file_path(file_path_) {} + + explicit DataFileInfo(std::string && file_path_) + : file_path(std::move(file_path_)) {} + + bool operator==(const DataFileInfo & rhs) const + { + return file_path == rhs.file_path; + } +}; class SinkToStorage; using SinkToStoragePtr = std::shared_ptr; @@ -38,7 +98,6 @@ class StorageObjectStorageConfiguration; using StorageObjectStorageConfigurationPtr = std::shared_ptr; struct StorageID; struct IObjectIterator; -struct RelativePathWithMetadata; class IObjectStorage; struct ObjectInfo; using 
ObjectInfoPtr = std::shared_ptr; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 6141e20cc6b5..bf22cc15ac6e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -272,6 +272,7 @@ IcebergIterator::IcebergIterator( , blocking_queue(100) , producer_task(std::nullopt) , callback(std::move(callback_)) + , table_schema_id(table_snapshot_->schema_id) { auto delete_file = deletes_iterator.next(); while (delete_file.has_value()) @@ -393,6 +394,10 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->info.data_object_file_path_key); } + object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( + *persistent_components.schema_processor, + table_schema_id, /// current schema id to use current column names + manifest_file_entry->parsed_entry->columns_infos)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); return object_info; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h index 625b19897463..95e372ccfb5c 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h @@ -74,7 +74,7 @@ class IcebergIterator : public IObjectIterator IDataLakeMetadata::FileProgressCallback callback_, Iceberg::TableStateSnapshotPtr table_snapshot_, Iceberg::IcebergDataSnapshotPtr data_snapshot_, - Iceberg::PersistentTableComponents persistent_components); + Iceberg::PersistentTableComponents persistent_components_); ObjectInfoPtr next(size_t) override; @@ -96,6 +96,7 @@ class IcebergIterator : public IObjectIterator std::vector equality_deletes_files; std::exception_ptr exception; std::mutex exception_mutex; + Int32 table_schema_id; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 12379496f211..f8ab0a7e5f3e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1136,12 +1136,14 @@ void IcebergMetadata::addDeleteTransformers( if (!iceberg_object_info->info.position_deletes_objects.empty()) { + LOG_DEBUG(log, "Constructing filter transform for position delete, there are {} delete objects", iceberg_object_info->info.position_deletes_objects.size()); builder.addSimpleTransform( [&](const SharedHeader & header) { return iceberg_object_info->getPositionDeleteTransformer(object_storage, header, format_settings, parser_shared_resources, local_context); }); } const auto & delete_files = iceberg_object_info->info.equality_deletes_objects; - LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); + if (!delete_files.empty()) + LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); for (const EqualityDeleteObject & delete_file : delete_files) { auto simple_transform_adder = [&](const SharedHeader & header) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 1ccf5240c15e..c7453440512e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,6 +1,23 @@ #pragma once #include "config.h" +#include +#include + +#include + +namespace DB::Iceberg +{ + +struct ColumnInfo +{ + std::optional rows_count; + std::optional bytes_size; + std::optional nulls_count; + std::optional hyperrectangle; +}; + +} #if USE_AVRO @@ -43,13 +60,6 @@ enum class ManifestFileContentType String FileContentTypeToString(FileContentType type); -struct ColumnInfo -{ - std::optional rows_count; - std::optional bytes_size; - std::optional nulls_count; -}; - struct PartitionSpecsEntry { Int32 source_id; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 9af7bb7c6fdb..e46b7272cc57 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -26,11 +26,14 @@ namespace Setting extern const SettingsBool use_hive_partitioning; extern const SettingsBool cluster_function_process_archive_on_multiple_nodes; extern const SettingsObjectStorageGranularityLevel cluster_table_function_split_granularity; + extern const SettingsUInt64 lock_object_storage_task_distribution_ms; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int INVALID_SETTING_VALUE; } String StorageObjectStorageCluster::getPathSample(ContextPtr context) @@ -232,6 +235,42 @@ void StorageObjectStorageCluster::updateExternalDynamicMetadataIfExists(ContextP setInMemoryMetadata(new_metadata); } +class TaskDistributor : public TaskIterator +{ +public: + TaskDistributor(std::shared_ptr iterator, + std::vector && ids_of_hosts, + bool send_over_whole_archive, + uint64_t lock_object_storage_task_distribution_ms, + ContextPtr context_, + bool iceberg_read_optimization_enabled) + : task_distributor( + iterator, + std::move(ids_of_hosts), + send_over_whole_archive, + lock_object_storage_task_distribution_ms, + iceberg_read_optimization_enabled) + , context(context_) {} + ~TaskDistributor() override = default; + bool supportRerunTask() const override { return true; } + void rescheduleTasksFromReplica(size_t number_of_current_replica) override + { + task_distributor.rescheduleTasksFromReplica(number_of_current_replica); + } + + ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const override + { + auto task = task_distributor.getNextTask(number_of_current_replica); + if (task) + return std::make_shared(std::move(task), context); + return std::make_shared(); + } + +private: + mutable StorageObjectStorageStableTaskDistributor task_distributor; + ContextPtr context; +}; + RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * filter, @@ -278,19 +317,24 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten } } - auto task_distributor = std::make_shared( - iterator, - std::move(ids_of_hosts), - /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]); + uint64_t lock_object_storage_task_distribution_ms = local_context->getSettingsRef()[Setting::lock_object_storage_task_distribution_ms]; - auto callback = std::make_shared( - [task_distributor, local_context](size_t number_of_current_replica) mutable -> ClusterFunctionReadTaskResponsePtr - { - auto task = 
task_distributor->getNextTask(number_of_current_replica); - if (task) - return std::make_shared(std::move(task), local_context); - return std::make_shared(); - }); + /// Check value to avoid negative result after conversion in microseconds. + /// Poco::Timestamp::TimeDiff is signed int 64. + static const uint64_t lock_object_storage_task_distribution_ms_max = 0x0020000000000000ULL; + if (lock_object_storage_task_distribution_ms > lock_object_storage_task_distribution_ms_max) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, + "Value lock_object_storage_task_distribution_ms is too big: {}, allowed maximum is {}", + lock_object_storage_task_distribution_ms, + lock_object_storage_task_distribution_ms_max + ); + + auto callback = std::make_shared(iterator, + std::move(ids_of_hosts), + /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes], + lock_object_storage_task_distribution_ms, + local_context, + /* iceberg_read_optimization_enabled */local_context->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]); return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 2a13d2d09603..1f3e1e7419ba 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ namespace ProfileEvents extern const Event ObjectStorageGlobFilteredObjects; extern const Event ObjectStoragePredicateFilteredObjects; extern const Event ObjectStorageReadObjects; + extern const Event ObjectStorageClusterProcessedTasks; + extern const Event ObjectStorageClusterWaitingMicroseconds; } namespace CurrentMetrics @@ -82,6 +85,7 @@ namespace Setting extern const SettingsUInt64 s3_path_filter_limit; extern const SettingsBool use_parquet_metadata_cache; extern const SettingsBool input_format_parquet_use_native_reader_v3; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } namespace ErrorCodes @@ -399,6 +403,16 @@ Chunk StorageObjectStorageSource::generate() }, read_context); + /// Not empty when allow_experimental_iceberg_read_optimization=true + /// and some columns were removed from read list as columns with constant values. + /// Restore data for these columns. 
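The cap of 0x0020000000000000 on lock_object_storage_task_distribution_ms is 2**53, the largest power of two whose microsecond equivalent still fits into a signed 64-bit Poco::Timestamp::TimeDiff after the ms-to-us conversion. A quick Python check of that bound:

    # Why the setting is capped at 2**53 milliseconds: the value is multiplied
    # by 1000 to get microseconds and must still fit a signed 64-bit integer.
    MAX_LOCK_MS = 0x0020000000000000        # 2**53
    INT64_MAX = 2**63 - 1

    assert MAX_LOCK_MS * 1000 <= INT64_MAX          # 2**53 ms fits as signed us
    assert (MAX_LOCK_MS * 2) * 1000 > INT64_MAX     # 2**54 ms would overflow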
+ for (const auto & constant_column : reader.constant_columns_with_values) + { + chunk.addColumn(constant_column.first, + constant_column.second.name_and_type.type->createColumnConst( + chunk.getNumRows(), constant_column.second.value)); + } + #if USE_PARQUET if (chunk_size && chunk.hasColumns()) { @@ -545,11 +559,34 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ObjectInfoPtr object_info; auto query_settings = configuration->getQuerySettings(context_); + bool not_a_path = false; + do { + not_a_path = false; object_info = file_iterator->next(processor); - if (!object_info || object_info->getPath().empty()) + if (!object_info) + return {}; + + if (object_info->relative_path_with_metadata.getCommand().isValid()) + { + auto retry_after_us = object_info->relative_path_with_metadata.getCommand().getRetryAfterUs(); + if (retry_after_us.has_value()) + { + not_a_path = true; + /// TODO: Make asyncronous waiting without sleep in thread + /// Now this sleep is on executor node in worker thread + /// Does not block query initiator + auto wait_time = std::min(Poco::Timestamp::TimeDiff(100000ul), retry_after_us.value()); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterWaitingMicroseconds, wait_time); + sleepForMicroseconds(wait_time); + continue; + } + object_info->relative_path_with_metadata.setFileMetaInfo(object_info->relative_path_with_metadata.getCommand().getFileMetaInfo()); + } + + if (object_info->getPath().empty()) return {}; if (!object_info->getObjectMetadata()) @@ -568,16 +605,28 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade else object_info->setObjectMetadata(object_storage->getObjectMetadata(path, with_tags)); } - } while (query_settings.skip_empty_files - && object_info->getObjectMetadata()->size_bytes == 0 - && object_info->getObjectMetadata()->is_size_known); + } + while (not_a_path + || (query_settings.skip_empty_files + && object_info->getObjectMetadata()->size_bytes == 0 + && object_info->getObjectMetadata()->is_size_known)); + + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterProcessedTasks); QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; + std::optional rows_count_from_metadata; auto try_get_num_rows_from_cache = [&]() -> std::optional { + if (rows_count_from_metadata.has_value()) + { + /// Must be non negative here + size_t value = rows_count_from_metadata.value(); + return value; + } + if (!schema_cache) return std::nullopt; @@ -595,6 +644,128 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade return schema_cache->tryGetNumRows(cache_key, get_last_mod_time); }; + /// List of columns with constant value in current file, and values + std::map constant_columns_with_values; + std::unordered_set constant_columns; + + NamesAndTypesList requested_columns_copy = read_from_format_info.requested_columns; + + std::unordered_map> requested_columns_list; + { + size_t column_index = 0; + for (const auto & column : requested_columns_copy) + requested_columns_list[column.getNameInStorage()] = std::make_pair(column_index++, column); + } + + if (context_->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]) + { + auto file_meta_data = object_info->relative_path_with_metadata.getFileMetaInfo(); + if (file_meta_data.has_value()) + { + bool is_all_rows_count_equals = true; + for (const auto & column : file_meta_data.value()->columns_info) + { + if (is_all_rows_count_equals && column.second.rows_count.has_value()) + 
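The retry branch above handles the case where the distributor answers with a retry-after command instead of a path: the worker sleeps at most 100 ms per iteration and asks again, without blocking the query initiator. A minimal Python sketch of that loop, with a hypothetical fetch_task callback:

    # Sketch of the worker-side wait: a "retry after N microseconds" response
    # makes the executor sleep a bounded slice and poll again, so files owned
    # by a busy or lost replica eventually become available to this worker.
    import time

    MAX_SLEEP_US = 100_000  # 100 ms upper bound per iteration

    def next_object(fetch_task):
        while True:
            task = fetch_task()
            if task is None:                  # iterator exhausted
                return None
            retry_after_us = task.get("retry_after_us")
            if retry_after_us is None:        # a real file path
                return task["path"]
            time.sleep(min(MAX_SLEEP_US, retry_after_us) / 1_000_000)

    responses = iter([{"retry_after_us": 20_000}, {"path": "data/part0.csv"}])
    print(next_object(lambda: next(responses, None)))  # sleeps 20 ms, prints the path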
{ + if (rows_count_from_metadata.has_value()) + { + if (column.second.rows_count.value() != rows_count_from_metadata.value()) + { + LOG_WARNING(log, "Inconsistent rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + } + else if (column.second.rows_count.value() < 0) + { + LOG_WARNING(log, "Negative rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + else + rows_count_from_metadata = column.second.rows_count; + } + + if (column.second.hyperrectangle.has_value()) + { + auto column_name = column.first; + + auto i_column = requested_columns_list.find(column_name); + if (i_column == requested_columns_list.end()) + continue; + + if (column.second.hyperrectangle.value().isPoint() && + (!column.second.nulls_count.has_value() || column.second.nulls_count.value() <= 0)) + { + /// isPoint() method checks before that left==right + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + column.second.hyperrectangle.value().left + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value '{}'", + object_info->getPath(), + column_name, + i_column->second.second.type, + column.second.hyperrectangle.value().left.dump()); + } + else if (column.second.rows_count.has_value() && column.second.nulls_count.has_value() + && column.second.rows_count.value() == column.second.nulls_count.value() + && i_column->second.second.type->isNullable()) + { + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + i_column->second.second.type); + } + } + } + for (const auto & column : requested_columns_list) + { + const auto & column_name = column.first; + + if (file_meta_data.value()->columns_info.contains(column_name)) + continue; + + if (!column.second.second.type->isNullable()) + continue; + + /// Column is nullable and absent in file + constant_columns_with_values[column.second.first] = + ConstColumnWithValue{ + column.second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + column.second.second.type); + } + } + + if (!constant_columns.empty()) + { + size_t original_columns = requested_columns_copy.size(); + requested_columns_copy = requested_columns_copy.eraseNames(constant_columns); + if (requested_columns_copy.size() + constant_columns.size() != original_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't remove constant columns for file {} correct, fallback to read. Founded constant columns: [{}]", + object_info->getPath(), constant_columns); + if (requested_columns_copy.empty()) + need_only_count = true; + } + } + std::optional num_rows_from_cache = need_only_count && context_->getSettingsRef()[Setting::use_cache_for_count_from_files] ? 
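A compact Python sketch of the classification performed above, under simplified assumptions about the stats layout: a point range with no NULLs yields a constant value, an all-NULL nullable column yields a constant NULL, a nullable column missing from the file is also treated as a constant NULL, and everything else stays in the read list.

    def split_constant_columns(requested, file_stats):
        """requested: {name: is_nullable}; file_stats: {name: stats dict}."""
        constants, to_read = {}, []
        for name, is_nullable in requested.items():
            st = file_stats.get(name)
            if st is None:
                if is_nullable:
                    constants[name] = None        # absent in file -> NULL
                else:
                    to_read.append(name)
                continue
            rng, nulls, rows = st.get("range"), st.get("nulls", 0), st.get("rows")
            if rng and rng[0] == rng[1] and nulls == 0:
                constants[name] = rng[0]          # point range, no NULLs
            elif is_nullable and rows is not None and nulls == rows:
                constants[name] = None            # all rows are NULL
            else:
                to_read.append(name)
        return constants, to_read

    consts, remaining = split_constant_columns(
        {"tag": False, "name": True, "value": False},
        {"tag": {"range": (5, 5), "nulls": 0, "rows": 4},
         "value": {"range": (1, 9), "nulls": 0, "rows": 4}})
    assert consts == {"tag": 5, "name": None} and remaining == ["value"]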
try_get_num_rows_from_cache() : std::nullopt; @@ -612,6 +783,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade columns.emplace_back(type->createColumn(), type, name); builder.init(Pipe(std::make_shared( std::make_shared(columns), *num_rows_from_cache, max_block_size))); + if (!constant_columns.empty()) + configuration->addDeleteTransformers(object_info, builder, format_settings, parser_shared_resources, context_); } else { @@ -766,7 +939,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade /// from chunk read by IInputFormat. builder.addSimpleTransform([&](const SharedHeader & header) { - return std::make_shared(header, read_from_format_info.requested_columns); + return std::make_shared(header, requested_columns_copy); }); auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -775,7 +948,12 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); return ReaderHolder( - object_info, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)); + object_info, + std::move(read_buf), + std::move(source), + std::move(pipeline), + std::move(current_reader), + std::move(constant_columns_with_values)); } std::future StorageObjectStorageSource::createReaderAsync() @@ -1242,12 +1420,14 @@ StorageObjectStorageSource::ReaderHolder::ReaderHolder( std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_) + std::unique_ptr reader_, + std::map && constant_columns_with_values_) : object_info(std::move(object_info_)) , read_buf(std::move(read_buf_)) , source(std::move(source_)) , pipeline(std::move(pipeline_)) , reader(std::move(reader_)) + , constant_columns_with_values(std::move(constant_columns_with_values_)) { } @@ -1261,6 +1441,7 @@ StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexc source = std::move(other.source); read_buf = std::move(other.read_buf); object_info = std::move(other.object_info); + constant_columns_with_values = std::move(other.constant_columns_with_values); return *this; } @@ -1275,6 +1456,12 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( , is_archive(is_archive_) , object_storage(object_storage_) { + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); + return; + } + ThreadPool pool( CurrentMetrics::StorageObjectStorageThreads, CurrentMetrics::StorageObjectStorageThreadsActive, @@ -1310,6 +1497,12 @@ ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::next(size_t) ObjectInfoPtr object_info; if (current_index >= buffer.size()) { + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); + return nullptr; + } + auto task = callback(); if (!task || task->isEmpty()) return nullptr; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 5cbe931a047e..1f8a712940af 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -89,6 +89,12 @@ class StorageObjectStorageSource : public ISource size_t total_files_read = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); + struct ConstColumnWithValue + { + NameAndTypePair 
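The constant columns dropped from the read list travel with the ReaderHolder as (position, type, value) entries and are re-inserted into every chunk in generate(), the equivalent of createColumnConst over chunk.getNumRows() rows. A row-oriented Python sketch of the re-insertion, not the actual columnar code:

    # Sketch: rows were read without the constant columns; add them back at
    # their original offsets so the block layout matches the requested columns.
    def restore_constants(rows, constants_by_position):
        """rows: tuples read from the file; constants_by_position: {index: value}."""
        restored = []
        for row in rows:
            out, read_iter = [], iter(row)
            width = len(row) + len(constants_by_position)
            for pos in range(width):
                if pos in constants_by_position:
                    out.append(constants_by_position[pos])
                else:
                    out.append(next(read_iter))
            restored.append(tuple(out))
        return restored

    # Requested columns were (tag, name, value); tag (pos 0) was constant 5 in this file.
    assert restore_constants([("a", 1), ("b", 2)], {0: 5}) == [(5, "a", 1), (5, "b", 2)]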
name_and_type; + Field value; + }; + struct ReaderHolder : private boost::noncopyable { public: @@ -97,7 +103,8 @@ class StorageObjectStorageSource : public ISource std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_); + std::unique_ptr reader_, + std::map && constant_columns_with_values_); ReaderHolder() = default; ReaderHolder(ReaderHolder && other) noexcept { *this = std::move(other); } @@ -116,6 +123,9 @@ class StorageObjectStorageSource : public ISource std::shared_ptr source; std::unique_ptr pipeline; std::unique_ptr reader; + + public: + std::map constant_columns_with_values; }; ReaderHolder reader; @@ -255,7 +265,7 @@ class StorageObjectStorageSource::KeysIterator : public IObjectIterator const ObjectStoragePtr object_storage; const NamesAndTypesList virtual_columns; const std::function file_progress_callback; - const std::vector keys; + const Strings keys; std::atomic index = 0; const bool ignore_non_existent_files; const bool skip_object_metadata; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp index 5a54601b171c..c661bf572a08 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp @@ -3,40 +3,71 @@ #include #include +namespace ProfileEvents +{ + extern const Event ObjectStorageClusterSentToMatchedReplica; + extern const Event ObjectStorageClusterSentToNonMatchedReplica; +}; + namespace DB { namespace ErrorCodes { extern const int LOGICAL_ERROR; -} + extern const int CANNOT_READ_ALL_DATA; +}; StorageObjectStorageStableTaskDistributor::StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_) + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_, + bool iceberg_read_optimization_enabled_) : iterator(std::move(iterator_)) , send_over_whole_archive(send_over_whole_archive_) , connection_to_files(ids_of_nodes_.size()) , ids_of_nodes(std::move(ids_of_nodes_)) + , lock_object_storage_task_distribution_us(lock_object_storage_task_distribution_ms_ * 1000) , iterator_exhausted(false) + , iceberg_read_optimization_enabled(iceberg_read_optimization_enabled_) { + Poco::Timestamp now; + size_t nodes = ids_of_nodes.size(); + for (size_t i = 0; i < nodes; ++i) + { + replica_to_files_to_be_processed[i] = std::list{}; + last_node_activity[i] = now; + } } ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getNextTask(size_t number_of_current_replica) { LOG_TRACE(log, "Received request from replica {} looking for a file", number_of_current_replica); - // 1. Check pre-queued files first - if (auto file = getPreQueuedFile(number_of_current_replica)) - return file; + saveLastNodeActivity(number_of_current_replica); - // 2. Try to find a matching file from the iterator - if (auto file = getMatchingFileFromIterator(number_of_current_replica)) - return file; + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost, can't set task for it anymore", + number_of_current_replica + ); + // 1. Check pre-queued files first + auto file = getPreQueuedFile(number_of_current_replica); + // 2. 
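A short Python sketch of the getNextTask flow introduced above, with hypothetical callbacks standing in for the three lookup stages: refuse replicas already marked lost, try pre-queued files, then a matching file from the iterator, then stealing, and remember every handed-out file per replica so it can be re-queued later.

    class Distributor:
        def __init__(self, replicas):
            self.in_flight = {r: [] for r in replicas}   # replica -> files handed out

        def get_next_task(self, replica, prequeued, from_iterator, steal):
            if replica not in self.in_flight:
                raise RuntimeError(f"replica {replica} was marked as lost")
            task = prequeued(replica) or from_iterator(replica) or steal(replica)
            if task is not None:
                self.in_flight[replica].append(task)     # remembered for rescheduling
            return task

    d = Distributor(["r0", "r1"])
    print(d.get_next_task("r0", lambda r: None, lambda r: "part0.csv", lambda r: None))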
Try to find a matching file from the iterator + if (!file) + file = getMatchingFileFromIterator(number_of_current_replica); // 3. Process unprocessed files if iterator is exhausted - return getAnyUnprocessedFile(number_of_current_replica); + if (!file) + file = getAnyUnprocessedFile(number_of_current_replica); + + if (file) + processed_file_list_ptr->second.push_back(file); + + return file; } size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String & file_path) @@ -48,16 +79,27 @@ size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String return 0; /// Rendezvous hashing - size_t best_id = 0; - UInt64 best_weight = sipHash64(ids_of_nodes[0] + file_path); - for (size_t id = 1; id < nodes_count; ++id) + auto replica = replica_to_files_to_be_processed.begin(); + if (replica == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "No active replicas, can't find best replica for file {}", + file_path + ); + + size_t best_id = replica->first; + UInt64 best_weight = sipHash64(ids_of_nodes[best_id] + file_path); + ++replica; + while (replica != replica_to_files_to_be_processed.end()) { + size_t id = replica->first; UInt64 weight = sipHash64(ids_of_nodes[id] + file_path); if (weight > best_weight) { best_weight = weight; best_id = id; } + ++replica; } return best_id; } @@ -95,6 +137,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getPreQueuedFile(size_t number_of_current_replica ); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica); return next_file; } @@ -138,6 +181,17 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter file_identifier = object_info->getIdentifier(); } + if (iceberg_read_optimization_enabled) + { + auto file_meta_info = object_info->relative_path_with_metadata.getFileMetaInfo(); + if (file_meta_info.has_value()) + { + auto file_path = send_over_whole_archive ? 
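The rendezvous (highest-random-weight) selection above now runs only over the replicas still present in replica_to_files_to_be_processed. A minimal Python sketch of the same choice; sha256 stands in for sipHash64 here:

    # Each file deterministically picks the active replica with the highest
    # weight, so adding or removing one replica only moves that replica's share.
    import hashlib

    def weight(node_id, path):
        return int.from_bytes(hashlib.sha256((node_id + path).encode()).digest()[:8], "big")

    def replica_for_file(path, active_replicas):
        if not active_replicas:
            raise RuntimeError("no active replicas")
        return max(active_replicas, key=lambda node: weight(node, path))

    nodes = ["replica0", "replica1", "replica2", "replica3"]
    print(replica_for_file("data/part0.csv", nodes))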
object_info->getPathOrPathToArchiveIfArchive() : object_info->getPath(); + object_info->relative_path_with_metadata.command.setFilePath(file_path); + object_info->relative_path_with_metadata.command.setFileMetaInfo(file_meta_info.value()); + } + } + size_t file_replica_idx = getReplicaForFile(file_identifier); if (file_replica_idx == number_of_current_replica) { @@ -146,6 +200,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter file_identifier, number_of_current_replica ); + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica); return object_info; } LOG_TEST( @@ -159,7 +214,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter // Queue file for its assigned replica { std::lock_guard lock(mutex); - unprocessed_files.emplace(file_identifier, object_info); + unprocessed_files.emplace(file_identifier, std::make_pair(object_info, file_replica_idx)); connection_to_files[file_replica_idx].push_back(object_info); } } @@ -169,26 +224,97 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getAnyUnprocessedFile(size_t number_of_current_replica) { + /// Limit time of node activity to keep task in queue + Poco::Timestamp activity_limit; + Poco::Timestamp oldest_activity; + if (lock_object_storage_task_distribution_us > 0) + activity_limit -= lock_object_storage_task_distribution_us; + std::lock_guard lock(mutex); if (!unprocessed_files.empty()) { auto it = unprocessed_files.begin(); - auto next_file = it->second; - unprocessed_files.erase(it); - auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + while (it != unprocessed_files.end()) + { + auto number_of_matched_replica = it->second.second; + auto last_activity = last_node_activity.find(number_of_matched_replica); + if (lock_object_storage_task_distribution_us <= 0 // file deferring is turned off + || it->second.second == number_of_current_replica // file is matching with current replica + || last_activity == last_node_activity.end() // msut never be happen, last_activity is filled for each replica on start + || activity_limit > last_activity->second) // matched replica did not ask for a new files for a while + { + auto next_file = it->second.first; + unprocessed_files.erase(it); + + auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + LOG_TRACE( + log, + "Iterator exhausted. Assigning unprocessed file {} to replica {} from matched replica {}", + file_path, + number_of_current_replica, + number_of_matched_replica + ); + + ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToNonMatchedReplica); + return next_file; + } + + oldest_activity = std::min(oldest_activity, last_activity->second); + ++it; + } + LOG_TRACE( log, - "Iterator exhausted. 
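The deferral rule above only lets a caller steal a file assigned to another replica if that replica has not requested work within the lock window; otherwise the caller gets a retry-after hint. A simplified Python sketch, assuming microsecond timestamps:

    import time

    def take_unprocessed(caller, queue, last_activity, lock_us):
        """queue: list of (file, owner); last_activity: {replica: monotonic us}."""
        now = time.monotonic_ns() // 1000
        activity_limit = now - lock_us
        oldest = now
        for i, (file, owner) in enumerate(queue):
            owner_seen = last_activity.get(owner, 0)
            # Steal if deferring is off, the file matches the caller,
            # or the owner has been silent for longer than the lock window.
            if lock_us <= 0 or owner == caller or owner_seen < activity_limit:
                del queue[i]
                return file, None
            oldest = min(oldest, owner_seen)
        if queue:
            return None, oldest - activity_limit   # retry-after, in microseconds
        return None, None

    q = [("part1.csv", "r1")]
    now_us = time.monotonic_ns() // 1000
    file, retry = take_unprocessed("r0", q, {"r1": now_us}, lock_us=30_000_000)
    assert file is None and retry is not None      # r1 is still active, so r0 must wait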
Assigning unprocessed file {} to replica {}", - file_path, - number_of_current_replica + "No unprocessed file for replica {}, need to retry after {} us", + number_of_current_replica, + oldest_activity - activity_limit ); - return next_file; + /// All unprocessed files owned by alive replicas with recenlty activity + /// Need to retry after (oldest_activity - activity_limit) microseconds + RelativePathWithMetadata::CommandInTaskResponse response; + response.setRetryAfterUs(oldest_activity - activity_limit); + return std::make_shared(response.toString()); } return {}; } +void StorageObjectStorageStableTaskDistributor::saveLastNodeActivity(size_t number_of_current_replica) +{ + Poco::Timestamp now; + std::lock_guard lock(mutex); + last_node_activity[number_of_current_replica] = now; +} + +void StorageObjectStorageStableTaskDistributor::rescheduleTasksFromReplica(size_t number_of_current_replica) +{ + LOG_INFO(log, "Replica {} is marked as lost, tasks are returned to queue", number_of_current_replica); + std::lock_guard lock(mutex); + + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost already", + number_of_current_replica + ); + + if (replica_to_files_to_be_processed.size() < 2) + throw Exception( + ErrorCodes::CANNOT_READ_ALL_DATA, + "All replicas were marked as lost" + ); + + for (const auto & file : processed_file_list_ptr->second) + { + auto file_replica_idx = getReplicaForFile(file->getPath()); + unprocessed_files.emplace(file->getPath(), std::make_pair(file, file_replica_idx)); + connection_to_files[file_replica_idx].push_back(file); + } + replica_to_files_to_be_processed.erase(number_of_current_replica); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h index 02d3ba7a030f..0cd10ac7188e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h @@ -4,7 +4,13 @@ #include #include #include +#include + +#include + #include +#include +#include #include #include #include @@ -18,26 +24,38 @@ class StorageObjectStorageStableTaskDistributor StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_); + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_, + bool iceberg_read_optimization_enabled_); ObjectInfoPtr getNextTask(size_t number_of_current_replica); + /// Insert objects back to unprocessed files + void rescheduleTasksFromReplica(size_t number_of_current_replica); + private: size_t getReplicaForFile(const String & file_path); ObjectInfoPtr getPreQueuedFile(size_t number_of_current_replica); ObjectInfoPtr getMatchingFileFromIterator(size_t number_of_current_replica); ObjectInfoPtr getAnyUnprocessedFile(size_t number_of_current_replica); + void saveLastNodeActivity(size_t number_of_current_replica); + const std::shared_ptr iterator; const bool send_over_whole_archive; std::vector> connection_to_files; - std::unordered_map unprocessed_files; + std::unordered_map> unprocessed_files; std::vector ids_of_nodes; + std::unordered_map last_node_activity; + Poco::Timestamp::TimeDiff lock_object_storage_task_distribution_us; + std::unordered_map> replica_to_files_to_be_processed; 
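rescheduleTasksFromReplica puts the lost replica's in-flight files back into the unprocessed queue, re-hashed across the replicas that remain, and losing the last replica is an error. A small Python sketch of that behaviour (sha256 again stands in for sipHash64):

    import hashlib

    def _owner(path, replicas):
        h = lambda n: int.from_bytes(hashlib.sha256((n + path).encode()).digest()[:8], "big")
        return max(replicas, key=h)

    def reschedule(lost, in_flight, unprocessed):
        """in_flight: {replica: [files]}; unprocessed: list of (file, owner)."""
        if lost not in in_flight:
            raise RuntimeError(f"replica {lost} already marked as lost")
        if len(in_flight) < 2:
            raise RuntimeError("all replicas were marked as lost")
        files = in_flight.pop(lost)
        remaining = list(in_flight)
        unprocessed.extend((f, _owner(f, remaining)) for f in files)

    in_flight = {"r0": ["a.csv"], "r1": []}
    queue = []
    reschedule("r0", in_flight, queue)
    print(queue)   # a.csv re-queued and now owned by r1, the only replica left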
+ std::mutex mutex; bool iterator_exhausted = false; + bool iceberg_read_optimization_enabled = false; LoggerPtr log = getLogger("StorageClusterTaskDistributor"); }; diff --git a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp index b00c1d609fa1..c9fc2831b214 100644 --- a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp +++ b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp @@ -101,7 +101,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {6})); @@ -110,7 +110,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 4})); @@ -119,7 +119,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 2, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 7, 8})); @@ -128,7 +128,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 3, 10)); ASSERT_TRUE(checkHead(paths, {3, 9})); @@ -139,7 +139,7 @@ TEST(RendezvousHashing, MultipleNodes) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); { std::vector paths; @@ -171,7 +171,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 6, 7, 8, 9})); @@ -180,7 +180,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + 
StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 3, 4})); @@ -191,7 +191,7 @@ TEST(RendezvousHashing, MultipleNodesReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); { std::vector paths; @@ -210,7 +210,7 @@ TEST(RendezvousHashing, MultipleNodesReducedClusterOneByOne) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0, false); std::vector paths0; std::vector paths1; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 085cdb0572c3..d94d87412e0e 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -95,18 +95,49 @@ void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const Sto ); } -RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( - const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +class FileTaskIterator : public TaskIterator { - auto iterator = std::make_shared(paths, std::nullopt, predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr +public: + FileTaskIterator(const Strings & files, + std::optional archive_info, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context, + bool distributed_processing = false) + : iterator(files + , archive_info + , predicate + , virtual_columns + , hive_partition_columns_to_read_from_file_path + , context + , distributed_processing) {} + + ~FileTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override { - auto file = iter->next(); + auto file = iterator.next(); if (file.empty()) return std::make_shared(); return std::make_shared(std::move(file)); - }; - auto callback = std::make_shared(std::move(next_callback)); + } + +private: + mutable StorageFileSource::FilesIterator iterator; +}; + +RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +{ + auto callback = std::make_shared( + paths, + std::nullopt, + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 18ed46e9e005..867c6831deac 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -130,20 +130,42 @@ void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageS ); } -RemoteQueryExecutor::Extension 
StorageURLCluster::getTaskIteratorExtension( - const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +class UrlTaskIterator : public TaskIterator { - auto iterator = std::make_shared( - uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr +public: + UrlTaskIterator(const String & uri, + size_t max_addresses, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context) + : iterator(uri, max_addresses, predicate, virtual_columns, hive_partition_columns_to_read_from_file_path, context) {} + + ~UrlTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override { - auto url = iter->next(); + auto url = iterator.next(); if (url.empty()) return std::make_shared(); return std::make_shared(std::move(url)); - }; - auto callback = std::make_shared(std::move(next_callback)); + } + +private: + mutable StorageURLSource::DisclosedGlobIterator iterator; +}; + +RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const +{ + auto callback = std::make_shared( + uri, + context->getSettingsRef()[Setting::glob_expansion_max_elements], + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/tests/integration/test_s3_cache_locality/__init__.py b/tests/integration/test_s3_cache_locality/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_s3_cache_locality/configs/cluster.xml b/tests/integration/test_s3_cache_locality/configs/cluster.xml new file mode 100644 index 000000000000..db54c35374b9 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/cluster.xml @@ -0,0 +1,126 @@ + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + + + + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + + + + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + + + + + + + /var/lib/clickhouse/raw_s3_cache + 10Gi + + + diff --git a/tests/integration/test_s3_cache_locality/configs/named_collections.xml b/tests/integration/test_s3_cache_locality/configs/named_collections.xml new file mode 100644 index 000000000000..6994aa3f5e77 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/named_collections.xml @@ -0,0 +1,10 @@ + + + + http://minio1:9001/root/data/* + minio + ClickHouse_Minio_P@ssw0rd + CSV> + + + diff --git a/tests/integration/test_s3_cache_locality/configs/users.xml b/tests/integration/test_s3_cache_locality/configs/users.xml new file mode 100644 index 
000000000000..4b6ba057ecb1 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/users.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_s3_cache_locality/test.py b/tests/integration/test_s3_cache_locality/test.py new file mode 100644 index 000000000000..68993d85aeed --- /dev/null +++ b/tests/integration/test_s3_cache_locality/test.py @@ -0,0 +1,262 @@ +import csv +import logging +import os +import shutil +import uuid + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.config_cluster import minio_secret_key + + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + + +def create_buckets_s3(cluster, files=1000): + minio = cluster.minio_client + + s3_data = [] + + for file_number in range(files): + file_name = f"data/generated_{files}/file_{file_number}.csv" + os.makedirs(os.path.join(SCRIPT_DIR, f"data/generated_{files}/"), exist_ok=True) + s3_data.append(file_name) + with open(os.path.join(SCRIPT_DIR, file_name), "w+", encoding="utf-8") as f: + # a String, b UInt64 + data = [] + + # Make all files a bit different + data.append( + ["str_" + str(file_number), file_number] + ) + + writer = csv.writer(f) + writer.writerows(data) + + for file in s3_data: + minio.fput_object( + bucket_name=cluster.minio_bucket, + object_name=file, + file_path=os.path.join(SCRIPT_DIR, file), + ) + + for obj in minio.list_objects(cluster.minio_bucket, recursive=True): + print(obj.object_name) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster = ClickHouseCluster(__file__) + # clickhouse0 not a member of cluster_XXX + for i in range(6): + cluster.add_instance( + f"clickhouse{i}", + main_configs=["configs/cluster.xml", "configs/named_collections.xml"], + user_configs=["configs/users.xml"], + macros={"replica": f"clickhouse{i}"}, + with_minio=True, + with_zookeeper=True, + stay_alive=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + create_buckets_s3(cluster) + create_buckets_s3(cluster, files=3) + + yield cluster + finally: + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_1000/"), ignore_errors=True) + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_3/"), ignore_errors=True) + cluster.shutdown() + + +def check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms, files=1000): + for host in list(cluster.instances.values()): + host.query("SYSTEM DROP FILESYSTEM CACHE 'raw_s3_cache'", ignore_error=True) + + settings = { + "enable_filesystem_cache": enable_filesystem_cache, + "filesystem_cache_name": "'raw_s3_cache'", + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + + query_id_first = str(uuid.uuid4()) + result_first = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_first}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_first, + ) + assert result_first == expected_result + query_id_second = str(uuid.uuid4()) + result_second = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_second}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 
+ SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_second, + ) + assert result_second == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_first}") + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_second}") + + s3_get_first = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_first}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_first}' + """, + ) + s3_get_second = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_second}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_second}' + """, + ) + + return int(s3_get_first), int(s3_get_second) + + +def check_s3_gets_by_hosts(cluster, node, expected_result, + lock_object_storage_task_distribution_ms, files=1000): + settings = { + "enable_filesystem_cache": False, + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + query_id = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id, + ) + assert result == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster}") + + s3_get = node.query( + f""" + SELECT ProfileEvents['S3GetObject'] + FROM clusterAllReplicas('{cluster}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id}' + ORDER BY hostname + """, + ) + + return [int(events) for events in s3_get.strip().split("\n")] + + +def check_s3_gets_repeat(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms): + # Repeat test several times to get average result + iterations = 1 if lock_object_storage_task_distribution_ms > 0 else 10 + s3_get_first_sum = 0 + s3_get_second_sum = 0 + for _ in range(iterations): + (s3_get_first, s3_get_second) = check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, lock_object_storage_task_distribution_ms) + s3_get_first_sum += s3_get_first + s3_get_second_sum += s3_get_second + return s3_get_first_sum, s3_get_second_sum + + +@pytest.mark.parametrize("lock_object_storage_task_distribution_ms ", [0, 30000]) +def test_cache_locality(started_cluster, lock_object_storage_task_distribution_ms): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_1000/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Algorithm does not give 100% guarantee, so add 10% on dispersion + dispersion = 0.0 if lock_object_storage_task_distribution_ms > 0 else 0.1 + + # No cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 0, lock_object_storage_task_distribution_ms) + assert s3_get_second == s3_get_first + + # With cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # Different replicas order + (s3_get_first, s3_get_second) = 
check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_34512', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # No last replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_1234', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.179 + dispersion) # actual value - 179 of 1000 files changed replica + + # No first replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_2345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) # actual value - 189 of 1000 files changed replica + + # No first replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # Add new replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_4523', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # New replica and old replica, different replicas order + # All files from removed replica changed replica + # Some files from existed replicas changed replica on the new replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_1234', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.368 + dispersion) # actual value - 368 of 1000 changed replica + + if (lock_object_storage_task_distribution_ms > 0): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [189,210,220,202,179] + s3_get = check_s3_gets_by_hosts('cluster_1234', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [247,243,264,246] + s3_get = check_s3_gets_by_hosts('cluster_2345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [251,280,248,221] + + +def test_cache_locality_few_files(started_cluster): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_3/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Rendezvous hash makes the next distribution: + # file_0 - clickhouse1 + # file_1 - clickhouse4 + # file_2 - clickhouse3 + # The same distribution must be in each query + for _ in range(10): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms=30000, files=3) + assert s3_get == [1,0,1,1,0] diff --git a/tests/integration/test_s3_cluster/data/graceful/part0.csv b/tests/integration/test_s3_cluster/data/graceful/part0.csv new file mode 100644 index 000000000000..2a8ceabbea58 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part0.csv @@ -0,0 +1 @@ +0,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part1.csv b/tests/integration/test_s3_cluster/data/graceful/part1.csv new file mode 100644 index 000000000000..1950012fffd2 
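The 0.179/0.189 tolerances used above follow from rendezvous hashing: dropping one of five replicas only reassigns the files whose best replica was the removed one, roughly 1/5 of them on average. A small Python simulation of that fraction (sha256 instead of sipHash64, so the exact counts differ from the test's):

    import hashlib

    def owner(path, replicas):
        h = lambda n: int.from_bytes(hashlib.sha256((n + path).encode()).digest()[:8], "big")
        return max(replicas, key=h)

    files = [f"file_{i}.csv" for i in range(1000)]
    five = [f"clickhouse{i}" for i in range(1, 6)]
    four = five[:-1]                                  # last replica removed
    moved = sum(owner(f, five) != owner(f, four) for f in files)
    print(moved)   # close to 200 of 1000; the patched test observed 179-189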
--- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part1.csv @@ -0,0 +1 @@ +1,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part2.csv b/tests/integration/test_s3_cluster/data/graceful/part2.csv new file mode 100644 index 000000000000..dc782d5adf9b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part2.csv @@ -0,0 +1 @@ +2,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part3.csv b/tests/integration/test_s3_cluster/data/graceful/part3.csv new file mode 100644 index 000000000000..6e581549d23c --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part3.csv @@ -0,0 +1 @@ +3,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part4.csv b/tests/integration/test_s3_cluster/data/graceful/part4.csv new file mode 100644 index 000000000000..bb5a4d956c51 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part4.csv @@ -0,0 +1 @@ +4,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part5.csv b/tests/integration/test_s3_cluster/data/graceful/part5.csv new file mode 100644 index 000000000000..5cb2c6be144b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part5.csv @@ -0,0 +1 @@ +5,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part6.csv b/tests/integration/test_s3_cluster/data/graceful/part6.csv new file mode 100644 index 000000000000..e2e2428d100d --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part6.csv @@ -0,0 +1 @@ +6,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part7.csv b/tests/integration/test_s3_cluster/data/graceful/part7.csv new file mode 100644 index 000000000000..3c819a315c20 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part7.csv @@ -0,0 +1 @@ +7,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part8.csv b/tests/integration/test_s3_cluster/data/graceful/part8.csv new file mode 100644 index 000000000000..72f39e512be3 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part8.csv @@ -0,0 +1 @@ +8,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part9.csv b/tests/integration/test_s3_cluster/data/graceful/part9.csv new file mode 100644 index 000000000000..f288cb2051dd --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part9.csv @@ -0,0 +1 @@ +9,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partA.csv b/tests/integration/test_s3_cluster/data/graceful/partA.csv new file mode 100644 index 000000000000..da99f68ba784 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partA.csv @@ -0,0 +1 @@ +10,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partB.csv b/tests/integration/test_s3_cluster/data/graceful/partB.csv new file mode 100644 index 000000000000..46591e0be815 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partB.csv @@ -0,0 +1 @@ +11,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partC.csv b/tests/integration/test_s3_cluster/data/graceful/partC.csv new file mode 100644 index 000000000000..24af8010b5c6 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partC.csv @@ -0,0 +1 @@ 
+12,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partD.csv b/tests/integration/test_s3_cluster/data/graceful/partD.csv new file mode 100644 index 000000000000..0365a5024871 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partD.csv @@ -0,0 +1 @@ +13,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partE.csv b/tests/integration/test_s3_cluster/data/graceful/partE.csv new file mode 100644 index 000000000000..3143c0eed915 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partE.csv @@ -0,0 +1 @@ +14,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partF.csv b/tests/integration/test_s3_cluster/data/graceful/partF.csv new file mode 100644 index 000000000000..d0306b9bb806 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partF.csv @@ -0,0 +1 @@ +15,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 76b8f0df2881..9c817209b8ba 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -2,11 +2,13 @@ import logging import os import shutil +import threading import time from email.errors import HeaderParseError import pytest +from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster from helpers.config_cluster import minio_secret_key from helpers.mock_servers import start_mock_servers @@ -21,6 +23,22 @@ "data/clickhouse/part123.csv", "data/database/part2.csv", "data/database/partition675.csv", + "data/graceful/part0.csv", + "data/graceful/part1.csv", + "data/graceful/part2.csv", + "data/graceful/part3.csv", + "data/graceful/part4.csv", + "data/graceful/part5.csv", + "data/graceful/part6.csv", + "data/graceful/part7.csv", + "data/graceful/part8.csv", + "data/graceful/part9.csv", + "data/graceful/partA.csv", + "data/graceful/partB.csv", + "data/graceful/partC.csv", + "data/graceful/partD.csv", + "data/graceful/partE.csv", + "data/graceful/partF.csv", ] @@ -76,6 +94,7 @@ def started_cluster(): macros={"replica": "node1", "shard": "shard1"}, with_minio=True, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_0_1", @@ -83,6 +102,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica2", "shard": "shard1"}, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_1_0", @@ -90,6 +110,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica1", "shard": "shard2"}, with_zookeeper=True, + stay_alive=True, ) logging.info("Starting cluster...") @@ -509,3 +530,63 @@ def test_cluster_default_expression(started_cluster): ) assert result == expected_result + + +def test_graceful_shutdown(started_cluster): + node = started_cluster.instances["s0_0_0"] + node_to_shutdown = started_cluster.instances["s0_1_0"] + + expected = TSV("64\tBar\t8\n56\tFoo\t8\n") + + num_lock = threading.Lock() + errors = 0 + + def query_cycle(): + nonlocal errors + try: + i = 0 + while i < 10: + i += 1 + # Query time 3-4 seconds + # Processing single object 1-2 seconds + result = node.query(f""" + SELECT sum(value),name,sum(sleep(1)+1) as sleep FROM s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/graceful/*', 'minio', '{minio_secret_key}', 'CSV', + 'value UInt32, name String') + GROUP BY name + ORDER BY name + SETTINGS max_threads=2 + """) + with num_lock: + if 
TSV(result) != expected: + errors += 1 + if errors >= 1: + break + except QueryRuntimeException: + with num_lock: + errors += 1 + + threads = [] + + for _ in range(10): + thread = threading.Thread(target=query_cycle) + thread.start() + threads.append(thread) + time.sleep(0.2) + + time.sleep(3) + + node_to_shutdown.query("SYSTEM STOP SWARM MODE") + + # enough time to complete processing of objects, started before "SYSTEM STOP SWARM MODE" + time.sleep(3) + + node_to_shutdown.stop_clickhouse(kill=True) + + for thread in threads: + thread.join() + + node_to_shutdown.start_clickhouse() + + assert errors == 0 diff --git a/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py new file mode 100644 index 000000000000..bdca97add2e5 --- /dev/null +++ b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py @@ -0,0 +1,215 @@ +import pytest + +from helpers.iceberg_utils import ( + get_uuid_str, + get_creation_expression, + execute_spark_query_general, +) + + +@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +@pytest.mark.parametrize("run_on_cluster", [False, True]) +def test_read_constant_columns_optimization(started_cluster_iceberg_with_spark, storage_type, run_on_cluster): + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_read_constant_columns_optimization_" + storage_type + "_" + get_uuid_str() + + def execute_spark_query(query: str): + return execute_spark_query_general( + spark, + started_cluster_iceberg_with_spark, + storage_type, + TABLE_NAME, + query, + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + tag INT, + date DATE, + date2 DATE, + name VARCHAR(50), + number BIGINT + ) + USING iceberg + PARTITIONED BY (identity(tag), years(date)) + OPTIONS('format-version'='2') + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasya', 5), + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasilisa', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (2, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 5), + (2, DATE '2025-01-21', DATE '2025-01-20', 'vasilisa', 5) + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} ALTER COLUMN number FIRST; + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 3, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa'), + (5, 3, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa') + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} RENAME COLUMN name TO name_old; + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} + ADD COLUMNS ( + name string + ); + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 4, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 4, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa', 'icebreaker'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg') + """ + ) + + # Totally must be 7 files + # Partitioned column 'tag' is constant in each file + # Column 'date' is constant in 6 files, has different 
values in (2-2025) + # Column 'date2' is constant in 4 files (1-2024, 2-2025, 5-2025, 6-2025) + # Column 'name_old' is constant in 3 files (1-2025, 2-2025 as 'name', 6-2025 as 'name_old') + # Column 'number' is globally constant + # New column 'name2' is present only in 3 files (4-2025, 5-2025, 6-2025), constant in two (4-2025, 6-2025) + # Files 1-2025 and 6-2025 have only constant columns + + creation_expression = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True, run_on_cluster=run_on_cluster + ) + + # Warm up metadata cache + for replica in started_cluster_iceberg_with_spark.instances.values(): + replica.query(f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0") + + all_data_expected_query_id = get_uuid_str() + all_data_expected = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=all_data_expected_query_id, + ) + const_only_expected_query_id = get_uuid_str() + const_only_expected = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_only_expected_query_id, + ) + const_partial_expected_query_id = get_uuid_str() + const_partial_expected = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial_expected_query_id, + ) + const_partial2_expected_query_id = get_uuid_str() + const_partial2_expected = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial2_expected_query_id, + ) + count_expected_query_id = get_uuid_str() + count_expected = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=count_expected_query_id, + ) + + all_data_query_id = get_uuid_str() + all_data_optimized = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=all_data_query_id, + ) + const_only_query_id = get_uuid_str() + const_only_optimized = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_only_query_id, + ) + const_partial_query_id = get_uuid_str() + const_partial_optimized = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial_query_id, + ) + const_partial2_query_id = get_uuid_str() + const_partial2_optimized = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial2_query_id, + ) + count_query_id = get_uuid_str() + count_optimized = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=count_query_id, + ) + + assert all_data_expected == all_data_optimized + assert const_only_expected == const_only_optimized + assert const_partial_expected == const_partial_optimized + assert const_partial2_expected == const_partial2_optimized + assert 
count_expected == count_optimized + + for replica in started_cluster_iceberg_with_spark.instances.values(): + replica.query("SYSTEM FLUSH LOGS") + + def check_events(query_id, event, is_cluster, expected): + res = instance.query( + f""" + SELECT + sum(tupleElement(arrayJoin(ProfileEvents),2)) as value + FROM + clusterAllReplicas('cluster_simple', system.query_log) + WHERE + type='QueryFinish' + AND tupleElement(arrayJoin(ProfileEvents),1)='{event}' + AND initial_query_id='{query_id}' + GROUP BY ALL + FORMAT CSV + """) + # Weird, but it looks like ReadFileMetadata does not use the local file cache in 26.1 + # metadata.json is always downloaded in 26.1, once per query or subquery + # In 25.8 the count was equal to expected, in 26.1 it is expected * 3 + 1 for the Local case + # and expected * 3 + 4 for the Cluster case, because each subquery loads metadata.json + assert int(res) == expected * 3 + (4 if is_cluster else 1) + + event = "S3GetObject" if storage_type == "s3" else "AzureGetObject" + + # Without the optimization ClickHouse reads all 7 files + check_events(all_data_expected_query_id, event, run_on_cluster, 7) + check_events(const_only_expected_query_id, event, run_on_cluster, 7) + check_events(const_partial_expected_query_id, event, run_on_cluster, 7) + check_events(const_partial2_expected_query_id, event, run_on_cluster, 7) + check_events(count_expected_query_id, event, run_on_cluster, 7) + + # If a file has only constant columns, it is not read + check_events(all_data_query_id, event, run_on_cluster, 5) # 1-2025, 6-2025 must not be read + check_events(const_only_query_id, event, run_on_cluster, 0) # All must not be read + check_events(const_partial_query_id, event, run_on_cluster, 4) # 1-2025, 6-2025 and 2-2025 must not be read + check_events(const_partial2_query_id, event, run_on_cluster, 3) # 6-2025 must not be read, 1-2024, 1-2025, 2-2025 don't have new column 'name' + check_events(count_query_id, event, run_on_cluster, 0) # All must not be read + + def compare_selects(query): + result_expected = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=0") + result_optimized = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=1") + assert result_expected == result_optimized + + compare_selects(f"SELECT _path,* FROM {creation_expression} ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE name_old='vasily' ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE ((tag + length(name_old)) % 2 = 1) ORDER BY ALL") \ No newline at end of file diff --git a/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml new file mode 100644 index 000000000000..a8239a28293c --- /dev/null +++ b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml @@ -0,0 +1,7 @@ +<clickhouse> + <profiles> + <default> + <lock_object_storage_task_distribution_ms>0</lock_object_storage_task_distribution_ms> + </default> + </profiles> +</clickhouse> diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 021e73d39235..2ab756654d7b 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -79,6 +79,7 @@ def started_cluster(): "configs/users.xml", "configs/s3_retry.xml", "configs/sync_insert.xml", + "configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( @@ -136,6 +137,7 @@ def started_cluster(): "configs/s3_retry.xml", "configs/process_archives_as_whole_with_cluster.xml", "configs/sync_insert.xml", +
"configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 253029f72e25..ec8c1b6aa923 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -162,6 +162,7 @@ SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP MERGES','START M SYSTEM TTL MERGES ['SYSTEM STOP TTL MERGES','SYSTEM START TTL MERGES','STOP TTL MERGES','START TTL MERGES'] TABLE SYSTEM SYSTEM FETCHES ['SYSTEM STOP FETCHES','SYSTEM START FETCHES','STOP FETCHES','START FETCHES'] TABLE SYSTEM SYSTEM MOVES ['SYSTEM STOP MOVES','SYSTEM START MOVES','STOP MOVES','START MOVES'] TABLE SYSTEM +SYSTEM SWARM ['SYSTEM STOP SWARM MODE','SYSTEM START SWARM MODE','STOP SWARM MODE','START SWARM MODE'] GLOBAL SYSTEM SYSTEM PULLING REPLICATION LOG ['SYSTEM STOP PULLING REPLICATION LOG','SYSTEM START PULLING REPLICATION LOG'] TABLE SYSTEM SYSTEM CLEANUP ['SYSTEM STOP CLEANUP','SYSTEM START CLEANUP'] TABLE SYSTEM SYSTEM VIEWS ['SYSTEM REFRESH VIEW','SYSTEM START VIEWS','SYSTEM STOP VIEWS','SYSTEM START VIEW','SYSTEM STOP VIEW','SYSTEM CANCEL VIEW','REFRESH VIEW','START VIEWS','STOP VIEWS','START VIEW','STOP VIEW','CANCEL VIEW'] VIEW SYSTEM diff --git a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql index 718eb63ad923..26048ffe5694 100644 --- a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql +++ b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql @@ -4,5 +4,9 @@ -- However, some settings in the experimental tier are meant to control another experimental feature, and then they can be enabled as long as the feature itself is disabled. -- These are in the exceptions list inside NOT IN. 
-SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('throw_on_unsupported_query_inside_transaction'); +SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ( + 'throw_on_unsupported_query_inside_transaction', +-- turned ON for Altinity Antalya builds specifically + 'allow_experimental_iceberg_read_optimization' +); SELECT name, value FROM system.merge_tree_settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('remove_rolled_back_parts_immediately'); From 587c292a325f12d35e7be70157f38bb1e11ae717 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov <32552679+zvonand@users.noreply.github.com> Date: Fri, 24 Apr 2026 01:59:49 +0200 Subject: [PATCH 2/6] Update SettingsChangesHistory.cpp --- src/Core/SettingsChangesHistory.cpp | 30 +++-------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 776393441e03..7961f095bc5b 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -308,10 +308,10 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586 // {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, // {"object_storage_remote_initiator", false, false, "New setting."}, - // {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, + {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, // {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, - // {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, - // {"allow_retries_in_cluster_requests", false, false, "New setting"}, + {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, + {"allow_retries_in_cluster_requests", false, false, "New setting"}, // {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, // {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, // {"export_merge_tree_partition_force_export", false, false, "New setting."}, @@ -329,30 +329,6 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() // {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, // {"object_storage_cluster", "", "", "Antalya: New setting"}, // {"object_storage_max_nodes", 0, 0, "Antalya: New setting"}, - {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, - {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, - {"lock_object_storage_task_distribution_ms", 500, 500, "New setting."}, - {"allow_retries_in_cluster_requests", false, false, "New setting"}, - // {"object_storage_remote_initiator", false, false, "New setting."}, - {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, - {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, - {"export_merge_tree_partition_force_export", false, false, "New setting."}, - {"export_merge_tree_partition_max_retries", 3, 3, "New setting."}, - {"export_merge_tree_partition_manifest_ttl", 180, 180, "New setting."}, - {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, - // 
{"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, - {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. Default enabled."}, - {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, - {"enable_alias_marker", true, true, "New setting."}, - // {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"}, - // {"input_format_parquet_verify_checksums", true, true, "New setting."}, - // {"output_format_parquet_write_checksums", false, true, "New setting."}, - {"export_merge_tree_part_max_bytes_per_file", 0, 0, "New setting."}, - {"export_merge_tree_part_max_rows_per_file", 0, 0, "New setting."}, - // {"cluster_table_function_split_granularity", "file", "file", "New setting."}, - // {"cluster_table_function_buckets_batch_size", 0, 0, "New setting."}, - {"export_merge_tree_part_throw_on_pending_mutations", true, true, "New setting."}, - {"export_merge_tree_part_throw_on_pending_patch_parts", true, true, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.8", { From 3be71969a0764c4ce8b44989cd4ec95fd93b18af Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Fri, 24 Apr 2026 16:36:12 +0200 Subject: [PATCH 3/6] Use value_bounds instead of ColumnInfo::hyperrectangle for min/max Removes the `hyperrectangle` field from `DB::Iceberg::ColumnInfo` that was re-added during the frontport. The field was removed upstream in PR https://github.com/ClickHouse/ClickHouse/pull/98231, which relocated raw min/max bounds to `ParsedManifestFileEntry::value_bounds`. The `DataFileMetaInfo` Iceberg constructor now deserializes those bounds via the shared `deserializeFieldFromBinaryRepr` helper (moved from `ManifestFileIterator.cpp` to `IcebergFieldParseHelpers`). Addresses @ianton-ru's comment at https://github.com/Altinity/ClickHouse/pull/1687#discussion_r3137852430. 
--- .../DataLakes/IDataLakeMetadata.cpp | 43 +++++++++- .../DataLakes/IDataLakeMetadata.h | 3 +- .../Iceberg/IcebergFieldParseHelpers.cpp | 73 +++++++++++++++++ .../Iceberg/IcebergFieldParseHelpers.h | 7 ++ .../DataLakes/Iceberg/IcebergIterator.cpp | 3 +- .../DataLakes/Iceberg/ManifestFile.h | 2 - .../Iceberg/ManifestFileIterator.cpp | 81 +------------------ 7 files changed, 124 insertions(+), 88 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index c0fb681f9595..2dae8857d04c 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -100,31 +102,64 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( DataFileMetaInfo::DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, Int32 schema_id, - const std::unordered_map & columns_info_) + const std::unordered_map & columns_info_, + const std::unordered_map> & value_bounds_) { - +#if USE_AVRO std::vector column_ids; for (const auto & column : columns_info_) column_ids.push_back(column.first); auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); std::unordered_map name_by_index; + std::unordered_map type_by_index; for (const auto & name_and_type : name_and_types) { const auto name = name_and_type.getNameInStorage(); auto index = schema_processor.tryGetColumnIDByName(schema_id, name); if (index.has_value()) + { name_by_index[index.value()] = name; + type_by_index[index.value()] = name_and_type.type; + } } for (const auto & column : columns_info_) { auto i_name = name_by_index.find(column.first); - if (i_name != name_by_index.end()) + if (i_name == name_by_index.end()) + continue; + + std::optional hyperrectangle; + + auto i_bounds = value_bounds_.find(column.first); + auto i_type = type_by_index.find(column.first); + if (i_bounds != value_bounds_.end() && i_type != type_by_index.end()) { - columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, column.second.hyperrectangle}; + const auto & type = i_type->second; + if (const auto type_id = type->getTypeId(); + type_id != TypeIndex::Tuple && type_id != TypeIndex::Map && type_id != TypeIndex::Array) + { + String left_str; + String right_str; + if (i_bounds->second.first.tryGet(left_str) && i_bounds->second.second.tryGet(right_str)) + { + auto left = Iceberg::deserializeFieldFromBinaryRepr(left_str, type, true); + auto right = Iceberg::deserializeFieldFromBinaryRepr(right_str, type, false); + if (left && right) + hyperrectangle = DB::Range(*left, true, *right, true); + } + } } + + columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, hyperrectangle}; } +#else + (void)schema_processor; + (void)schema_id; + (void)columns_info_; + (void)value_bounds_; +#endif } DataFileMetaInfo::DataFileMetaInfo(Poco::JSON::Object::Ptr file_info) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index 945618a97ad1..a7fb48a8a2b5 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -63,7 +63,8 @@ class DataFileMetaInfo explicit DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, Int32 schema_id, - const std::unordered_map & columns_info_); + 
const std::unordered_map & columns_info_, + const std::unordered_map> & value_bounds_); void serialize(WriteBuffer & out) const; static DataFileMetaInfo deserialize(ReadBuffer & in); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp index 0874b663303a..1e84302758c4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.cpp @@ -4,7 +4,10 @@ #include #include +#include #include +#include +#include #include #include @@ -98,6 +101,76 @@ std::vector fieldToInt64Array(const Field & value, std::string_view conte return result; } +std::optional deserializeFieldFromBinaryRepr(const std::string & str, DataTypePtr expected_type, bool lower_bound) +{ + auto non_nullable_type = removeNullable(expected_type); + auto column = non_nullable_type->createColumn(); + if (WhichDataType(non_nullable_type).isDecimal()) + { + /// Iceberg store decimal values as unscaled value with two's-complement big-endian binary + /// using the minimum number of bytes for the value + /// Our decimal binary representation is little endian + /// so we cannot reuse our default code for parsing it. + int64_t unscaled_value = 0; + + // Convert from big-endian to signed int + for (const auto byte : str) + unscaled_value = (unscaled_value << 8) | static_cast(byte); + + /// Add sign + if (str[0] & 0x80) + { + int64_t sign_extension = -1; + sign_extension <<= (str.size() * 8); + unscaled_value |= sign_extension; + } + + /// NOTE: It's very weird, but Decimal values for lower bound and upper bound + /// are stored rounded, without fractional part. What is more strange + /// the integer part is rounded mathematically correctly according to fractional part. + /// Example: 17.22 -> 17, 8888.999 -> 8889, 1423.77 -> 1424. + /// I've checked two implementations: Spark and Amazon Athena and both of them + /// do this. + /// + /// The problem is -- we cannot use rounded values for lower bounds and upper bounds. + /// Example: upper_bound(x) = 17.22, but it's rounded 17.00, now condition WHERE x >= 17.21 will + /// check rounded value and say: "Oh largest value is 17, so values bigger than 17.21 cannot be in this file, + /// let's skip it". But it will produce incorrect result since actual value (17.22 >= 17.21) is stored in this file. + /// + /// To handle this issue we subtract 1 from the integral part for lower_bound and add 1 to integral + /// part of upper_bound. This produces: 17.22 -> [16.0, 18.0]. So this is more rough boundary, + /// but at least it doesn't lead to incorrect results. + if (int32_t scale = getDecimalScale(*non_nullable_type)) + { + int64_t scaler = lower_bound ? 
-10 : 10; + while (--scale) + scaler *= 10; + + unscaled_value += scaler; + } + + if (const auto * decimal_type = checkDecimal(*non_nullable_type)) + { + DecimalField result(static_cast(unscaled_value), decimal_type->getScale()); + return result; + } + if (const auto * decimal_type = checkDecimal(*non_nullable_type)) + { + DecimalField result(unscaled_value, decimal_type->getScale()); + return result; + } + + return std::nullopt; + } + + /// For all other types except decimal binary representation + /// matches our internal representation + column->insertData(str.data(), str.length()); + Field result; + column->get(0, result); + return result; +} + } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h index 285062cb3d4f..21156647fd4b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergFieldParseHelpers.h @@ -4,11 +4,14 @@ #if USE_AVRO +#include +#include #include #include #include #include +#include namespace DB::Iceberg { @@ -25,6 +28,10 @@ Int64 fieldToPeriodMs(const Field & value, std::string_view context, std::string /// Convert a Field containing an Array to vector, validating each element. std::vector fieldToInt64Array(const Field & value, std::string_view context, std::string_view arg_name); +/// Deserialize a single lower/upper bound value from Iceberg's binary representation. +/// See https://iceberg.apache.org/spec/#appendix-d-single-value-serialization +std::optional deserializeFieldFromBinaryRepr(const std::string & str, DataTypePtr expected_type, bool lower_bound); + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index bf22cc15ac6e..7de8317abad2 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -397,7 +397,8 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( *persistent_components.schema_processor, table_schema_id, /// current schema id to use current column names - manifest_file_entry->parsed_entry->columns_infos)); + manifest_file_entry->parsed_entry->columns_infos, + manifest_file_entry->parsed_entry->value_bounds)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); return object_info; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index c7453440512e..3a1bccc53411 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,7 +1,6 @@ #pragma once #include "config.h" -#include #include #include @@ -14,7 +13,6 @@ struct ColumnInfo std::optional rows_count; std::optional bytes_size; std::optional nulls_count; - std::optional hyperrectangle; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp index 9142b458d816..d68fa264e0f1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -45,86 +46,6 @@ namespace DB::Iceberg using 
namespace DB; -namespace -{ - /// Iceberg stores lower_bounds and upper_bounds serialized with some custom deserialization as bytes array - /// https://iceberg.apache.org/spec/#appendix-d-single-value-serialization - std::optional deserializeFieldFromBinaryRepr(std::string str, DB::DataTypePtr expected_type, bool lower_bound) - { - auto non_nullable_type = DB::removeNullable(expected_type); - auto column = non_nullable_type->createColumn(); - if (DB::WhichDataType(non_nullable_type).isDecimal()) - { - /// Iceberg store decimal values as unscaled value with two's-complement big-endian binary - /// using the minimum number of bytes for the value - /// Our decimal binary representation is little endian - /// so we cannot reuse our default code for parsing it. - int64_t unscaled_value = 0; - - // Convert from big-endian to signed int - for (const auto byte : str) - unscaled_value = (unscaled_value << 8) | static_cast(byte); - - /// Add sign - if (str[0] & 0x80) - { - int64_t sign_extension = -1; - sign_extension <<= (str.size() * 8); - unscaled_value |= sign_extension; - } - - /// NOTE: It's very weird, but Decimal values for lower bound and upper bound - /// are stored rounded, without fractional part. What is more strange - /// the integer part is rounded mathematically correctly according to fractional part. - /// Example: 17.22 -> 17, 8888.999 -> 8889, 1423.77 -> 1424. - /// I've checked two implementations: Spark and Amazon Athena and both of them - /// do this. - /// - /// The problem is -- we cannot use rounded values for lower bounds and upper bounds. - /// Example: upper_bound(x) = 17.22, but it's rounded 17.00, now condition WHERE x >= 17.21 will - /// check rounded value and say: "Oh largest value is 17, so values bigger than 17.21 cannot be in this file, - /// let's skip it". But it will produce incorrect result since actual value (17.22 >= 17.21) is stored in this file. - /// - /// To handle this issue we subtract 1 from the integral part for lower_bound and add 1 to integral - /// part of upper_bound. This produces: 17.22 -> [16.0, 18.0]. So this is more rough boundary, - /// but at least it doesn't lead to incorrect results. - if (int32_t scale = DB::getDecimalScale(*non_nullable_type)) - { - int64_t scaler = lower_bound ? 
-10 : 10; - while (--scale) - scaler *= 10; - - unscaled_value += scaler; - } - - if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) - { - DB::DecimalField result(static_cast(unscaled_value), decimal_type->getScale()); - return result; - } - if (const auto * decimal_type = DB::checkDecimal(*non_nullable_type)) - { - DB::DecimalField result(unscaled_value, decimal_type->getScale()); - return result; - } - else - { - return std::nullopt; - } - } - else - { - /// For all other types except decimal binary representation - /// matches our internal representation - column->insertData(str.data(), str.length()); - DB::Field result; - column->get(0, result); - return result; - } - } - -} - const std::vector & ManifestFileIterator::ManifestFileEntriesHandle::getFilesWithoutDeleted(FileContentType content_type) const { From 741c2598e072c2e6f5e27aea88d3a941b614d0b6 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Mon, 30 Mar 2026 12:44:24 +0200 Subject: [PATCH 4/6] Fix row policies silently ignored on Iceberg tables with PREWHERE enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Iceberg read optimization (`allow_experimental_iceberg_read_optimization`) identifies constant columns from Iceberg metadata and removes them from the read request. When all requested columns become constant, it sets `need_only_count = true`, which tells the Parquet reader to skip all initialization — including `preparePrewhere` — and just return the raw row count from file metadata. This completely bypasses `row_level_filter` (row policies) and `prewhere_info`, returning unfiltered row counts. The InterpreterSelectQuery relies on the storage to apply these filters when `supportsPrewhere` is true and does not add a fallback FilterStep to the query plan, so the filter is silently lost. The fix prevents `need_only_count` from being set when an active `row_level_filter` or `prewhere_info` exists in the format filter info. Fixes #1595 (cherry picked from commit f204850f4a481f9f9478cd5ff98d2769f6271439) --- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 1f3e1e7419ba..658c3cd6525f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -761,7 +761,8 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (requested_columns_copy.size() + constant_columns.size() != original_columns) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't remove constant columns for file {} correct, fallback to read. Founded constant columns: [{}]", object_info->getPath(), constant_columns); - if (requested_columns_copy.empty()) + if (requested_columns_copy.empty() + && (!format_filter_info || (!format_filter_info->row_level_filter && !format_filter_info->prewhere_info))) need_only_count = true; } } From 037825f31f59c54918b8be08ca3bedd3f8ab8292 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 31 Mar 2026 14:30:26 +0200 Subject: [PATCH 5/6] Fix Iceberg read optimization replacing prewhere columns with constant NULLs The Altinity-specific constant column optimization (`allow_experimental_iceberg_read_optimization`) scans `requested_columns` for nullable columns absent from the Iceberg file metadata and replaces them with constant NULLs. 
However, `requested_columns` can also contain columns produced by `prewhere_info` or `row_level_filter` expressions (e.g. `equals(boolean_col, false)`). These computed columns are not in the file metadata, and their result type is often `Nullable(UInt8)`, so the optimization incorrectly treats them as missing file columns and replaces them with NULLs. This corrupts the prewhere pipeline: the Parquet reader evaluates the filter expression correctly, but the constant column optimization then overwrites the result with NULLs. With `need_filter = false` (old planner, PREWHERE + WHERE), all rows appear to fail the filter, producing empty output. With `need_filter = true`, the filter column is NULL so all rows are filtered out. The fix skips columns that match the `prewhere_info` or `row_level_filter` column names, since these are computed at read time and never stored in the file. (cherry picked from commit b7696a30dbf8064963b5e75ec45f5882e55bb7de) --- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 658c3cd6525f..895c964422f4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -739,6 +739,13 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (!column.second.second.type->isNullable()) continue; + /// Skip columns produced by prewhere or row-level filter expressions — + /// they are computed at read time, not stored in the file. + if (format_filter_info + && ((format_filter_info->prewhere_info && column_name == format_filter_info->prewhere_info->prewhere_column_name) + || (format_filter_info->row_level_filter && column_name == format_filter_info->row_level_filter->column_name))) + continue; + /// Column is nullable and absent in file constant_columns_with_values[column.second.first] = ConstColumnWithValue{ From 5959fbdd1ec7b13674b6ff8c87447d2367508150 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 27 Apr 2026 11:43:09 +0200 Subject: [PATCH 6/6] Decode Iceberg value_bounds with the file's schema, not the table's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DataFileMetaInfo::DataFileMetaInfo` (Iceberg constructor introduced in 3be71969a07) deserialized `value_bounds` using the table's current schema. After schema evolution (e.g. `int` -> `long`) the bytes were still encoded with the file's old type — a 4-byte int — but were read as 8 bytes for `Int64`. `ColumnVector::insertData` ignores the length argument and always reads `sizeof(T)` bytes via `unalignedLoad`, so the extra 4 bytes came from adjacent memory and produced a garbage hyperrectangle. The garbage range often satisfied `Range::isPoint`, which made the iceberg read optimization replace the column with a constant value taken from the garbage bound, corrupting query results. Pass the file's `resolved_schema_id` separately so types are looked up against the schema the data file was written with, while column names keep coming from the current table schema (so the resulting `columns_info` map is keyed by names callers know about). Reproducer: `test_storage_iceberg_schema_evolution/test_evolved_schema_simple.py::test_evolved_schema_simple` — all 12 parametrizations failed at the assertion after `ALTER COLUMN a TYPE BIGINT`. 
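
As an aside, a minimal standalone C++ sketch of the failure mode (illustrative only — it models the width mismatch, not the actual reader code path): a bound serialized with the file's old 4-byte int type is decoded once with the correct width and once with the evolved 8-byte width, and the wider read pulls in whatever bytes happen to sit next to the value:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        // A bound written by the old schema as a 4-byte little-endian int (value 42),
        // followed by unrelated bytes that merely happen to be adjacent in memory.
        std::vector<unsigned char> buffer = {42, 0, 0, 0, 0xDE, 0xAD, 0xBE, 0xEF};

        int32_t with_file_type = 0;   // decoded with the file schema's type: reads 4 bytes
        std::memcpy(&with_file_type, buffer.data(), sizeof(with_file_type));

        int64_t with_table_type = 0;  // decoded with the evolved table type: reads 8 bytes
        std::memcpy(&with_table_type, buffer.data(), sizeof(with_table_type));

        std::cout << with_file_type << '\n';   // 42
        std::cout << with_table_type << '\n';  // garbage value on little-endian hosts
    }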
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../DataLakes/IDataLakeMetadata.cpp | 27 +++++++++++++------ .../DataLakes/IDataLakeMetadata.h | 12 +++++++-- .../DataLakes/Iceberg/IcebergIterator.cpp | 1 + 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 2dae8857d04c..e553907b4a04 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -101,7 +101,8 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( DataFileMetaInfo::DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, - Int32 schema_id, + Int32 table_schema_id, + Int32 file_schema_id, const std::unordered_map & columns_info_, const std::unordered_map> & value_bounds_) { @@ -110,18 +111,27 @@ DataFileMetaInfo::DataFileMetaInfo( for (const auto & column : columns_info_) column_ids.push_back(column.first); - auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); + /// Names are resolved via the table schema so that the resulting `columns_info` + /// map is keyed by the current column names that callers know about. + auto table_name_and_types = schema_processor.tryGetFieldsCharacteristics(table_schema_id, column_ids); std::unordered_map name_by_index; - std::unordered_map type_by_index; - for (const auto & name_and_type : name_and_types) + for (const auto & name_and_type : table_name_and_types) { const auto name = name_and_type.getNameInStorage(); - auto index = schema_processor.tryGetColumnIDByName(schema_id, name); + auto index = schema_processor.tryGetColumnIDByName(table_schema_id, name); if (index.has_value()) - { name_by_index[index.value()] = name; + } + + /// Types come from the file's schema because `value_bounds_` are encoded with + /// that schema's column types — see Iceberg single-value serialization spec. + std::unordered_map type_by_index; + auto file_name_and_types = schema_processor.tryGetFieldsCharacteristics(file_schema_id, column_ids); + for (const auto & name_and_type : file_name_and_types) + { + auto index = schema_processor.tryGetColumnIDByName(file_schema_id, name_and_type.getNameInStorage()); + if (index.has_value()) type_by_index[index.value()] = name_and_type.type; - } } for (const auto & column : columns_info_) @@ -156,7 +166,8 @@ DataFileMetaInfo::DataFileMetaInfo( } #else (void)schema_processor; - (void)schema_id; + (void)table_schema_id; + (void)file_schema_id; (void)columns_info_; (void)value_bounds_; #endif diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index a7fb48a8a2b5..1a43b9f20a85 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -59,10 +59,18 @@ class DataFileMetaInfo std::optional hyperrectangle; }; - // Extract metadata from Iceberg structure + // Extract metadata from Iceberg structure. + // table_schema_id is the current table schema, used to resolve column names that + // appear as keys in the resulting `columns_info` map. + // file_schema_id is the schema the data file (and its `value_bounds_`) was written + // with — bounds bytes are encoded with that schema's column types, so they must be + // deserialized using those types. After schema evolution (e.g. 
`int` -> `long`) + // the two ids differ, and using the table schema's type would misinterpret the + // bytes and produce a garbage hyperrectangle. explicit DataFileMetaInfo( const Iceberg::IcebergSchemaProcessor & schema_processor, - Int32 schema_id, + Int32 table_schema_id, + Int32 file_schema_id, const std::unordered_map & columns_info_, const std::unordered_map> & value_bounds_); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 7de8317abad2..97de3771264e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -397,6 +397,7 @@ ObjectInfoPtr IcebergIterator::next(size_t) object_info->relative_path_with_metadata.setFileMetaInfo(std::make_shared( *persistent_components.schema_processor, table_schema_id, /// current schema id to use current column names + manifest_file_entry->resolved_schema_id, /// file's schema id to interpret value_bounds bytes manifest_file_entry->parsed_entry->columns_infos, manifest_file_entry->parsed_entry->value_bounds)); ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos);