From 5d463d42352ed65c2e991f156271b1d2382f92a1 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 30 Oct 2024 00:35:55 +0000
Subject: [PATCH 1/5] added stream support

---
 cpp/include/nvtext/subword_tokenize.hpp  |  2 ++
 cpp/src/text/subword/load_hash_file.cu   |  6 ++++--
 cpp/src/text/subword/subword_tokenize.cu | 11 +++--------
 cpp/tests/CMakeLists.txt                 |  1 +
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index c4210699975..1f526ef2123 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -67,6 +67,7 @@ struct hashed_vocabulary {
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -157,6 +158,7 @@ tokenizer_result subword_tokenize(
   uint32_t stride,
   bool do_lower_case,
   bool do_truncate,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index eca703e2604..b13ad0a7de8 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -289,10 +289,12 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 }  // namespace detail
 
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
-  std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr)
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr);
+  return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index d7e04a0c208..dee589d6daf 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -293,17 +293,12 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t stride,
                                   bool do_lower_case,
                                   bool do_truncate,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::subword_tokenize(strings,
-                                  vocabulary_table,
-                                  max_sequence_length,
-                                  stride,
-                                  do_lower_case,
-                                  do_truncate,
-                                  cudf::get_default_stream(),
-                                  mr);
+  return detail::subword_tokenize(
+    strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b78a64d0e55..6ef841b5fa5 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -741,6 +741,7 @@ ConfigureTest(
   streams/text/ngrams_test.cpp
   streams/text/replace_test.cpp
   streams/text/stemmer_test.cpp
+  streams/text/subword_tokenize_test.cpp
   streams/text/tokenize_test.cpp
   STREAM_MODE
   testing

From bd1ade782a01da2678af48db680990cc9296d47a Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 30 Oct 2024 00:44:54 +0000
Subject: [PATCH 2/5] doc

---
 cpp/include/nvtext/subword_tokenize.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 1f526ef2123..4d06aa5d4bc 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -62,6 +62,7 @@ struct hashed_vocabulary {
  * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
  *        Note that this is the file AFTER python/perfect_hash.py has been used
  *        for preprocessing.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
@@ -148,6 +149,7 @@ struct tokenizer_result {
  * @param do_truncate If true, the tokenizer will discard all the token-ids after
  *        `max_sequence_length` for each input string. If false, it will use a new row
  *        in the output token-ids to continue generating the output.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to allocate any returned objects.
  * @return token-ids, attention-mask, and metadata
  */

From 3f8abec71526fcfe400e561e1afbf172be43464f Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 30 Oct 2024 15:54:12 +0000
Subject: [PATCH 3/5] oops, forgot to add a file!

---
 .../streams/text/subword_tokenize_test.cpp | 111 ++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 cpp/tests/streams/text/subword_tokenize_test.cpp

diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp
new file mode 100644
index 00000000000..954898b4722
--- /dev/null
+++ b/cpp/tests/streams/text/subword_tokenize_test.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <nvtext/subword_tokenize.hpp>
+
+#include <fstream>
+#include <vector>
+
+// Global environment for temporary files
+auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+struct TextSubwordTest : public cudf::test::BaseFixture {};
+
+// Create a fake hashed vocab text file for the tests in this source file.
+// The vocab only includes the following words:
+//  'this', 'is', 'a', 'test', 'tést'
+// The period '.' character also has a token id.
+void create_hashed_vocab(std::string const& hash_file)
+{
+  std::vector<std::pair<int, int>> coefficients(23, {65559, 0});
+  std::ofstream outfile(hash_file, std::ofstream::out);
+  outfile << "1\n0\n" << coefficients.size() << "\n";
+  for (auto c : coefficients)
+    outfile << c.first << " " << c.second << "\n";
+  std::vector<uint64_t> hash_table(23, 0);
+  outfile << hash_table.size() << "\n";
+  hash_table[0]  = 3015668L;              // based on values
+  hash_table[1]  = 6205475701751155871L;  // from the
+  hash_table[5]  = 6358029;               // bert_hash_table.txt
+  hash_table[16] = 451412625363L;         // file for the test
+  hash_table[20] = 6206321707968235495L;  // words above
+  for (auto h : hash_table)
+    outfile << h << "\n";
+  outfile << "100\n101\n102\n\n";
+}
+
+TEST(TextSubwordTest, Tokenize)
+{
+  uint32_t nrows = 100;
+  std::vector<char const*> h_strings(nrows, "This is a test. A test this is.");
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream());
+
+  uint32_t max_sequence_length = 16;
+  uint32_t stride              = 16;
+
+  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
+                                         *vocab,
+                                         max_sequence_length,
+                                         stride,
+                                         true,   // do_lower_case
+                                         false,  // do_truncate
+                                         cudf::test::get_default_stream());
+
+  EXPECT_EQ(nrows, result.nrows_tensor);
+
+  {
+    std::vector<uint32_t> base_data(
+      {2023, 2003, 1037, 3231, 1012, 1037, 3231, 2023, 2003, 1012, 0, 0, 0, 0, 0, 0});
+    std::vector<uint32_t> h_expected;
+    for (uint32_t idx = 0; idx < nrows; ++idx)
+      h_expected.insert(h_expected.end(), base_data.begin(), base_data.end());
+    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
+  }
+
+  {
+    std::vector<uint32_t> base_data({1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0});
+    std::vector<uint32_t> h_expected;
+    for (uint32_t idx = 0; idx < nrows; ++idx)
+      h_expected.insert(h_expected.end(), base_data.begin(), base_data.end());
+    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
+  }
+
+  {
+    std::vector<uint32_t> h_expected;
+    for (uint32_t idx = 0; idx < nrows; ++idx) {
+      // 0,0,9,1,0,9,2,0,9,3,0,9,4,0,9,5,0,9,6,0,9,7,0,9,8,0,9,9,0,9,...
+      h_expected.push_back(idx);
+      h_expected.push_back(0);
+      h_expected.push_back(9);
+    }
+    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected);
+  }
+}

From cf65f0ece351f2035cda050627738e7c297529e8 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 30 Oct 2024 19:39:55 +0000
Subject: [PATCH 4/5] simplify test

---
 .../streams/text/subword_tokenize_test.cpp | 33 -------------------
 1 file changed, 33 deletions(-)

diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp
index 954898b4722..928db6b2042 100644
--- a/cpp/tests/streams/text/subword_tokenize_test.cpp
+++ b/cpp/tests/streams/text/subword_tokenize_test.cpp
@@ -75,37 +75,4 @@ TEST(TextSubwordTest, Tokenize)
                                          true,   // do_lower_case
                                          false,  // do_truncate
                                          cudf::test::get_default_stream());
-
-  EXPECT_EQ(nrows, result.nrows_tensor);
-
-  {
-    std::vector<uint32_t> base_data(
-      {2023, 2003, 1037, 3231, 1012, 1037, 3231, 2023, 2003, 1012, 0, 0, 0, 0, 0, 0});
-    std::vector<uint32_t> h_expected;
-    for (uint32_t idx = 0; idx < nrows; ++idx)
-      h_expected.insert(h_expected.end(), base_data.begin(), base_data.end());
-    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
-  }
-
-  {
-    std::vector<uint32_t> base_data({1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0});
-    std::vector<uint32_t> h_expected;
-    for (uint32_t idx = 0; idx < nrows; ++idx)
-      h_expected.insert(h_expected.end(), base_data.begin(), base_data.end());
-    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
-  }
-
-  {
-    std::vector<uint32_t> h_expected;
-    for (uint32_t idx = 0; idx < nrows; ++idx) {
-      // 0,0,9,1,0,9,2,0,9,3,0,9,4,0,9,5,0,9,6,0,9,7,0,9,8,0,9,9,0,9,...
-      h_expected.push_back(idx);
-      h_expected.push_back(0);
-      h_expected.push_back(9);
-    }
-    cudf::test::fixed_width_column_wrapper<uint32_t> expected(h_expected.begin(), h_expected.end());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected);
-  }
 }

From 3dfeada4090c14aea1b0e118d1a8865fc790396d Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Thu, 31 Oct 2024 03:43:01 +0000
Subject: [PATCH 5/5] addressing pr comments

---
 .../streams/text/subword_tokenize_test.cpp | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp
index 928db6b2042..9474e6b269c 100644
--- a/cpp/tests/streams/text/subword_tokenize_test.cpp
+++ b/cpp/tests/streams/text/subword_tokenize_test.cpp
@@ -39,34 +39,37 @@ struct TextSubwordTest : public cudf::test::BaseFixture {};
 // The period '.' character also has a token id.
 void create_hashed_vocab(std::string const& hash_file)
 {
-  std::vector<std::pair<int, int>> coefficients(23, {65559, 0});
+  constexpr size_t coefsize = 23;
+  std::vector<std::pair<int, int>> coefficients(coefsize, {65559, 0});
   std::ofstream outfile(hash_file, std::ofstream::out);
   outfile << "1\n0\n" << coefficients.size() << "\n";
-  for (auto c : coefficients)
+  for (auto c : coefficients) {
     outfile << c.first << " " << c.second << "\n";
-  std::vector<uint64_t> hash_table(23, 0);
+  }
+  std::vector<uint64_t> hash_table(coefsize, 0);
   outfile << hash_table.size() << "\n";
   hash_table[0]  = 3015668L;              // based on values
   hash_table[1]  = 6205475701751155871L;  // from the
   hash_table[5]  = 6358029;               // bert_hash_table.txt
   hash_table[16] = 451412625363L;         // file for the test
   hash_table[20] = 6206321707968235495L;  // words above
-  for (auto h : hash_table)
+  for (auto h : hash_table) {
     outfile << h << "\n";
+  }
   outfile << "100\n101\n102\n\n";
 }
 
 TEST(TextSubwordTest, Tokenize)
 {
-  uint32_t nrows = 100;
+  uint32_t const nrows = 100;
   std::vector<char const*> h_strings(nrows, "This is a test. A test this is.");
-  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
-  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  cudf::test::strings_column_wrapper strings(h_strings.cbegin(), h_strings.cend());
+  std::string const hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
   auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream());
 
-  uint32_t max_sequence_length = 16;
-  uint32_t stride = 16;
+  uint32_t const max_sequence_length = 16;
+  uint32_t const stride              = 16;
 
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
                                          *vocab,
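
A minimal caller-side sketch of the API after this series, based on the signatures in the patched header; the stream object, the "vocab.txt" path, and the tokenizer arguments below are illustrative placeholders, not part of the patch:

#include <nvtext/subword_tokenize.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream.hpp>

void tokenize_on_user_stream(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;  // user-owned stream instead of cudf::get_default_stream()
  // Both entry points now take the stream ahead of the memory resource.
  auto vocab  = nvtext::load_vocabulary_file("vocab.txt", stream.view());
  auto result = nvtext::subword_tokenize(input,
                                         *vocab,
                                         64,     // max_sequence_length
                                         64,     // stride
                                         true,   // do_lower_case
                                         false,  // do_truncate
                                         stream.view());
  stream.synchronize();  // result tensors are only valid once work on the stream completes
}

Because the new stream parameters default to cudf::get_default_stream(), existing callers compile unchanged; only code that wants stream-ordered execution needs to pass a stream explicitly.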