Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into refactor-dask-cudf
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora authored Nov 4, 2024
2 parents a84128a + a2001dd commit ba3032a
Show file tree
Hide file tree
Showing 12 changed files with 882 additions and 152 deletions.
32 changes: 16 additions & 16 deletions cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ class distinct_hash_join {
* Result: {{1}, {0}}
* @endcode
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down Expand Up @@ -620,7 +620,7 @@ conditional_inner_join(table_view const& left,
* Result: {{0, 1, 2}, {None, 0, None}}
* @endcode
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down Expand Up @@ -666,7 +666,7 @@ conditional_left_join(table_view const& left,
* Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
* @endcode
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down Expand Up @@ -705,7 +705,7 @@ conditional_full_join(table_view const& left,
* Result: {1}
* @endcode
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down Expand Up @@ -746,7 +746,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
* Result: {0, 2}
* @endcode
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down Expand Up @@ -793,7 +793,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
* Result: {{1}, {0}}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -855,7 +855,7 @@ mixed_inner_join(
* Result: {{0, 1, 2}, {None, 0, None}}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -917,7 +917,7 @@ mixed_left_join(
* Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -972,7 +972,7 @@ mixed_full_join(
* Result: {1}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -1022,7 +1022,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
* Result: {0, 2}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -1061,7 +1061,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -1103,7 +1103,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_in
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
Expand Down Expand Up @@ -1142,7 +1142,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
* If the provided predicate returns NULL for a pair of rows
* (left, right), that pair is not included in the output.
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand All @@ -1167,7 +1167,7 @@ std::size_t conditional_inner_join_size(
* If the provided predicate returns NULL for a pair of rows
* (left, right), that pair is not included in the output.
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand All @@ -1192,7 +1192,7 @@ std::size_t conditional_left_join_size(
* If the provided predicate returns NULL for a pair of rows
* (left, right), that pair is not included in the output.
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand All @@ -1217,7 +1217,7 @@ std::size_t conditional_left_semi_join_size(
* If the provided predicate returns NULL for a pair of rows
* (left, right), that pair is not included in the output.
*
* @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
* @throw cudf::data_type_error if the binary predicate outputs a non-boolean result.
*
* @param left The left table
* @param right The right table
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/join/conditional_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -178,7 +179,8 @@ conditional_join(table_view const& left,
auto const parser =
ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr};
CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
"The expression must produce a boolean output.");
"The expression must produce a boolean output.",
cudf::data_type_error);

auto left_table = table_device_view::create(left, stream);
auto right_table = table_device_view::create(right, stream);
Expand Down Expand Up @@ -330,7 +332,8 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
auto const parser =
ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr};
CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
"The expression must produce a boolean output.");
"The expression must produce a boolean output.",
cudf::data_type_error);

auto left_table = table_device_view::create(left, stream);
auto right_table = table_device_view::create(right, stream);
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/join/mixed_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

Expand Down Expand Up @@ -115,7 +116,8 @@ mixed_join(
auto const parser = ast::detail::expression_parser{
binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr};
CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
"The expression must produce a boolean output.");
"The expression must produce a boolean output.",
cudf::data_type_error);

// TODO: The non-conditional join impls start with a dictionary matching,
// figure out what that is and what it's needed for (and if conditional joins
Expand Down Expand Up @@ -381,7 +383,8 @@ compute_mixed_join_output_size(table_view const& left_equality,
auto const parser = ast::detail::expression_parser{
binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr};
CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
"The expression must produce a boolean output.");
"The expression must produce a boolean output.",
cudf::data_type_error);

// TODO: The non-conditional join impls start with a dictionary matching,
// figure out what that is and what it's needed for (and if conditional joins
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ from pylibcudf.libcudf.io.csv cimport (
write_csv as cpp_write_csv,
)
from pylibcudf.libcudf.io.data_sink cimport data_sink
from pylibcudf.libcudf.io.types cimport compression_type, sink_info
from pylibcudf.libcudf.io.types cimport sink_info
from pylibcudf.libcudf.table.table_view cimport table_view

from cudf._lib.io.utils cimport make_sink_info
Expand Down Expand Up @@ -148,13 +148,13 @@ def read_csv(
byte_range = (0, 0)

if compression is None:
c_compression = compression_type.NONE
c_compression = plc.io.types.CompressionType.NONE
else:
compression_map = {
"infer": compression_type.AUTO,
"gzip": compression_type.GZIP,
"bz2": compression_type.BZIP2,
"zip": compression_type.ZIP,
"infer": plc.io.types.CompressionType.AUTO,
"gzip": plc.io.types.CompressionType.GZIP,
"bz2": plc.io.types.CompressionType.BZIP2,
"zip": plc.io.types.CompressionType.ZIP,
}
c_compression = compression_map[compression]

Expand Down
38 changes: 15 additions & 23 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool

cimport pylibcudf.libcudf.io.types as cudf_io_types
from pylibcudf.io.types cimport compression_type
from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
from pylibcudf.libcudf.io.types cimport compression_type
from pylibcudf.libcudf.types cimport data_type, type_id
from pylibcudf.types cimport DataType

Expand All @@ -24,15 +20,6 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
import pylibcudf as plc


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
if on_bad_lines.lower() == "error":
return json_recovery_mode_t.FAIL
elif on_bad_lines.lower() == "recover":
return json_recovery_mode_t.RECOVER_WITH_NULL
else:
raise TypeError(f"Invalid parameter for {on_bad_lines=}")


cpdef read_json(object filepaths_or_buffers,
object dtype,
bool lines,
Expand All @@ -41,7 +28,7 @@ cpdef read_json(object filepaths_or_buffers,
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns,
object on_bad_lines):
str on_bad_lines):
"""
Cython function to call into libcudf API, see `read_json`.
Expand All @@ -64,19 +51,24 @@ cpdef read_json(object filepaths_or_buffers,
filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode()

# Setup arguments
cdef cudf_io_types.compression_type c_compression

if compression is not None:
if compression == 'gzip':
c_compression = cudf_io_types.compression_type.GZIP
c_compression = plc.io.types.CompressionType.GZIP
elif compression == 'bz2':
c_compression = cudf_io_types.compression_type.BZIP2
c_compression = plc.io.types.CompressionType.BZIP2
elif compression == 'zip':
c_compression = cudf_io_types.compression_type.ZIP
c_compression = plc.io.types.CompressionType.ZIP
else:
c_compression = cudf_io_types.compression_type.AUTO
c_compression = plc.io.types.CompressionType.AUTO
else:
c_compression = plc.io.types.CompressionType.NONE

if on_bad_lines.lower() == "error":
c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL
elif on_bad_lines.lower() == "recover":
c_on_bad_lines = plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL
else:
c_compression = cudf_io_types.compression_type.NONE
raise TypeError(f"Invalid parameter for {on_bad_lines=}")

processed_dtypes = None

Expand Down Expand Up @@ -108,7 +100,7 @@ cpdef read_json(object filepaths_or_buffers,
keep_quotes = keep_quotes,
mixed_types_as_string = mixed_types_as_string,
prune_columns = prune_columns,
recovery_mode = _get_json_recovery_mode(on_bad_lines)
recovery_mode = c_on_bad_lines
)
df = cudf.DataFrame._from_data(
*_data_from_columns(
Expand All @@ -130,7 +122,7 @@ cpdef read_json(object filepaths_or_buffers,
keep_quotes = keep_quotes,
mixed_types_as_string = mixed_types_as_string,
prune_columns = prune_columns,
recovery_mode = _get_json_recovery_mode(on_bad_lines)
recovery_mode = c_on_bad_lines
)

df = cudf.DataFrame._from_data(
Expand Down
Loading

0 comments on commit ba3032a

Please sign in to comment.