rapidsai · wence- · Jan 23, 2025 · Jan 23, 2025 · Jan 23, 2025 · Jan 24, 2025
@@ -643,10 +643,13 @@ add_library(
   src/reshape/interleave_columns.cu
   src/reshape/tile.cu
   src/rolling/detail/optimized_unbounded_window.cpp
+  src/rolling/detail/range_following.cu
+  src/rolling/detail/range_preceding.cu
   src/rolling/detail/rolling_collect_list.cu
   src/rolling/detail/rolling_fixed_window.cu
   src/rolling/detail/rolling_variable_window.cu
   src/rolling/grouped_rolling.cu
+  src/rolling/range_rolling.cu
   src/rolling/range_window_bounds.cpp
   src/rolling/rolling.cu
   src/round/round.cu

@@ -428,7 +428,10 @@ ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 # ##################################################################################################
 # * rolling benchmark
 # ---------------------------------------------------------------------------------
-ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+ConfigureNVBench(
+  ROLLING_NVBENCH rolling/grouped_range_rolling_sum.cu rolling/grouped_rolling_sum.cpp
+  rolling/range_rolling_sum.cu rolling/rolling_sum.cpp
+)
 
 add_custom_target(
   run_benchmarks

@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/rolling/range_window_bounds.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <cstdint>
+#include <optional>
+
+void bench_grouped_range_rolling_sum(nvbench::state& state)
+{
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  // Configurable parameter is window range.
+  // Since orderby column is approximately equally spaced at unit
+  // intervals, this approximately controls the number of entries in
+  // the window.
+  auto const preceding_range = cudf::numeric_scalar<cudf::size_type>{
+    static_cast<cudf::size_type>(state.get_int64("preceding_range") * 1000), true};
+  auto const following_range = cudf::numeric_scalar<cudf::size_type>{
+    static_cast<cudf::size_type>(state.get_int64("preceding_range") * 1000), true};
+  auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
+
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<std::int32_t>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<std::int32_t>(), row_count{num_rows}, profile);
+  }();
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+    return cudf::sort(cudf::table_view{{keys->view()}});
+  }();
+  auto orderby = [&]() {
+    auto seq = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, num_rows);
+    // Equally spaced rows separated by 1000 unit intervals
+    thrust::transform(
+      rmm::exec_policy(cudf::get_default_stream()),
+      thrust::make_counting_iterator(static_cast<cudf::size_type>(0)),
+      thrust::make_counting_iterator(static_cast<cudf::size_type>(num_rows)),
+      seq->mutable_view().begin<cudf::size_type>(),
+      [] __device__(cudf::size_type i) { return static_cast<cudf::size_type>(i) * 1000; });
+    // Add some units of noise
+    data_profile profile = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<cudf::duration_ms>(), distribution_id::NORMAL, -2000, 2000);
+    profile.set_null_probability(has_nulls ? std::optional<double>{400.0 / num_rows}
+                                           : std::nullopt);
+    auto noise =
+      create_random_column(cudf::type_to_id<cudf::size_type>(), row_count{num_rows}, profile);
+    auto result =
+      cudf::binary_operation(seq->view(), noise->view(), cudf::binary_operator::ADD, seq->type());
+    auto columns = cudf::sort_by_key(cudf::table_view{{result->view()}},
+                                     cudf::table_view{{keys->get_column(0).view(), result->view()}},
+                                     {cudf::order::ASCENDING, cudf::order::ASCENDING},
+                                     {cudf::null_order::AFTER, cudf::null_order::AFTER})
+                     ->release();
+    return std::move(columns[0]);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::grouped_range_rolling_window(keys->view(),
+                                         orderby->view(),
+                                         cudf::order::ASCENDING,
+                                         vals->view(),
+                                         cudf::range_window_bounds::get(preceding_range),
+                                         cudf::range_window_bounds::get(following_range),
+                                         1,
+                                         *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH(bench_grouped_range_rolling_sum)
+  .set_name("range_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_range", {100})
+  .add_int64_axis("following_range", {100})
+  .add_int64_axis("has_nulls", {0, 1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
+
+NVBENCH_BENCH(bench_grouped_range_rolling_sum)
+  .set_name("range_grouped_rolling_sum_large_windows")
+  .add_int64_power_of_two_axis("num_rows", {28})
+  .add_int64_axis("preceding_range", {10'000, 40'000})
+  .add_int64_axis("following_range", {0})
+  .add_int64_axis("has_nulls", {0, 1})
+  .add_int64_axis("cardinality", {100});
@@ -68,3 +68,12 @@ NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
   .add_int64_axis("following_size", {2})
   .add_int64_axis("min_periods", {1})
   .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t>))
+  .set_name("row_grouped_rolling_sum_large_windows")
+  .add_int64_power_of_two_axis("num_rows", {28})
+  .add_int64_axis("preceding_size", {10'000, 40'000})
+  .add_int64_axis("following_size", {0})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100});
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/rolling/range_window_bounds.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <cstdint>
+#include <optional>
+
+void bench_range_rolling_sum(nvbench::state& state)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  // Configurable parameter is window range in seconds.
+  // Since orderby column is approximately equally spaced at 1s
+  // intervals, this approximately controls the number of entries in
+  // the window.
+  auto const preceding_range = cudf::duration_scalar<cudf::duration_ms>{
+    cudf::duration_ms{state.get_int64("preceding_range") * 1000}, true};
+  auto const following_range = cudf::duration_scalar<cudf::duration_ms>{
+    cudf::duration_ms{state.get_int64("following_range") * 1000}, true};
+  auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
+
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<std::int32_t>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<std::int32_t>(), row_count{num_rows}, profile);
+  }();
+
+  auto orderby = [&]() {
+    auto seq =
+      cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}, num_rows);
+    // Equally spaced rows separated by 1s
+    thrust::transform(
+      rmm::exec_policy(cudf::get_default_stream()),
+      thrust::make_counting_iterator(static_cast<cudf::size_type>(0)),
+      thrust::make_counting_iterator(static_cast<cudf::size_type>(num_rows)),
+      seq->mutable_view().begin<cudf::timestamp_ms>(),
+      [] __device__(cudf::size_type i) {
+        return cudf::timestamp_ms{cudf::duration_ms{static_cast<std::int64_t>(i) * 1000}};
+      });
+    // Add some milliseconds of noise
+    data_profile profile = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<cudf::duration_ms>(), distribution_id::NORMAL, -2000, 2000);
+    profile.set_null_probability(has_nulls ? std::optional<double>{400.0 / num_rows}
+                                           : std::nullopt);
+    auto noise =
+      create_random_column(cudf::type_to_id<cudf::duration_ms>(), row_count{num_rows}, profile);
+    auto result =
+      cudf::binary_operation(seq->view(), noise->view(), cudf::binary_operator::ADD, seq->type());
+    auto columns =
+      cudf::sort(
+        cudf::table_view{{result->view()}}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER})
+        ->release();
+    return std::move(columns[0]);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::grouped_range_rolling_window(cudf::table_view{},
+                                         orderby->view(),
+                                         cudf::order::ASCENDING,
+                                         vals->view(),
+                                         cudf::range_window_bounds::get(preceding_range),
+                                         cudf::range_window_bounds::get(following_range),
+                                         1,
+                                         *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH(bench_range_rolling_sum)
+  .set_name("range_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_range", {100})
+  .add_int64_axis("following_range", {100})
+  .add_int64_axis("has_nulls", {0, 1});
+
+NVBENCH_BENCH(bench_range_rolling_sum)
+  .set_name("range_rolling_sum_large_windows")
+  .add_int64_power_of_two_axis("num_rows", {28})
+  .add_int64_axis("preceding_range", {10'000, 40'000})
+  .add_int64_axis("following_range", {0})
+  .add_int64_axis("has_nulls", {0, 1});
@@ -132,3 +132,11 @@ NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
   .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
   .add_int64_axis("preceding_size", {10, 100})
   .add_int64_axis("following_size", {2});
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t>))
+  .set_name("row_fixed_rolling_sum_large_windows")
+  .add_int64_power_of_two_axis("num_rows", {28})
+  .add_int64_axis("preceding_size", {10'000, 40'000})
+  .add_int64_axis("following_size", {0})
+  .add_int64_axis("min_periods", {1});