-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement
fast_float
for String#to_f
- Loading branch information
1 parent
caf57c2
commit 6e96919
Showing
13 changed files
with
2,735 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
require "spec" | ||
|
||
# Round-trip check over the whole `Float32` domain: each of the 4294967296
# possible bit patterns is printed with `to_s`, parsed back with `to_f32?`, and
# compared against the value it came from.
#
# The patterns are grouped into 4096 examples of 0x100000 values each (the
# example name shows the range of bit patterns covered), which gives usable
# progress output and makes the suite double as a rough benchmark.
#
# This was originally added when `String#to_f` moved from `LibC.strtod` to
# `fast_float`, but it does not depend on any particular implementation.
describe "x.to_s.to_f32 == x" do
  0x1000_u32.times do |bin|
    it "%03x00000..%03xfffff" % {bin, bin} do
      0x100000.times do |low|
        # The upper 12 bits select the bin, the lower 20 bits enumerate it.
        expected = (bin << 20 | low).unsafe_as(Float32)
        parsed = expected.to_s.to_f32?.should_not be_nil

        # NaN never compares equal to itself, so only NaN-ness is checked.
        if expected.nan?
          parsed.nan?.should be_true
          next
        end

        parsed.should eq(expected)
        # `eq` cannot tell 0.0 from -0.0; compare the signs explicitly.
        Math.copysign(1, parsed).should eq(Math.copysign(1, expected))
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
require "spec" | ||
|
||
# Placeholder spec group for `String`: it currently defines no examples. | ||
describe String do | ||
# TODO: add examples; this group is intentionally empty for now. | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
struct Float | ||
# :nodoc: | ||
# Source port of the floating-point part of fast_float for C++: | ||
# https://github.com/fastfloat/fast_float | ||
# | ||
# fast_float implements the C++17 `std::from_chars`, which accepts a subset of | ||
# the C `strtod` / `strtof`'s string format: | ||
# | ||
# - a leading plus sign is disallowed, but both fast_float and this port | ||
# accept it; | ||
# - the exponent may be required or disallowed, depending on the format | ||
# argument (this port always allows both); | ||
# - hexfloats are not enabled by default, and fast_float doesn't implement them; | ||
# (https://github.com/fastfloat/fast_float/issues/124) | ||
# - hexfloats cannot start with `0x` or `0X`. | ||
# | ||
# The following is their license: | ||
# | ||
# Licensed under either of Apache License, Version 2.0 or MIT license or | ||
# BOOST license. | ||
# | ||
# Unless you explicitly state otherwise, any contribution intentionally | ||
# submitted for inclusion in this repository by you, as defined in the | ||
# Apache-2.0 license, shall be triple licensed as above, without any | ||
# additional terms or conditions. | ||
# | ||
# Main differences from the original fast_float: | ||
# | ||
# - Only `UC == UInt8` is implemented and tested, not the other wide chars; | ||
# - No explicit SIMD (the original mainly uses this for wide char strings). | ||
# | ||
# The following compile-time configuration is assumed: | ||
# | ||
# - #define FASTFLOAT_ALLOWS_LEADING_PLUS | ||
# - #define FLT_EVAL_METHOD 0 | ||
module FastFloat | ||
# Current revision: https://github.com/fastfloat/fast_float/tree/v6.1.6 | ||
|
||
# Parses the bytes of *str* as a `Float64` using the `:general` format.
#
# *whitespace* is forwarded to `from_chars_advanced` and also decides whether
# ASCII whitespace may follow the number in strict mode. When *strict* is
# true the whole string must be consumed; otherwise anything may follow the
# number.
#
# Returns `nil` when the parser reports an error or when disallowed trailing
# characters remain.
def self.to_f64?(str : String, whitespace : Bool, strict : Bool) : Float64?
  first = str.to_unsafe
  last = first + str.bytesize
  opts = ParseOptionsT(typeof(first.value)).new(format: :general)
  # Written by `from_chars_advanced`; only read after a successful parse.
  result = uninitialized Float64

  parsed = BinaryFormat_Float64.new.from_chars_advanced(first, last, pointerof(result), opts, whitespace: whitespace)
  return nil unless parsed.ec == Errno::NONE
  return nil unless trailing_chars_allowed?(parsed.ptr, last, whitespace, strict)
  result
end
|
||
# Parses the bytes of *str* as a `Float32` using the `:general` format.
#
# *whitespace* is forwarded to `from_chars_advanced` and also decides whether
# ASCII whitespace may follow the number in strict mode. When *strict* is
# true the whole string must be consumed; otherwise anything may follow the
# number.
#
# Returns `nil` when the parser reports an error or when disallowed trailing
# characters remain.
def self.to_f32?(str : String, whitespace : Bool, strict : Bool) : Float32?
  first = str.to_unsafe
  last = first + str.bytesize
  opts = ParseOptionsT(typeof(first.value)).new(format: :general)
  # Written by `from_chars_advanced`; only read after a successful parse.
  result = uninitialized Float32

  parsed = BinaryFormat_Float32.new.from_chars_advanced(first, last, pointerof(result), opts, whitespace: whitespace)
  return nil unless parsed.ec == Errno::NONE
  return nil unless trailing_chars_allowed?(parsed.ptr, last, whitespace, strict)
  result
end
|
||
# Decides whether the bytes in `ptr...finish`, i.e. whatever follows the
# parsed number, are acceptable.
#
# Non-strict parsing accepts anything. Strict parsing requires the input to
# end at *ptr*, after optionally skipping ASCII whitespace when *whitespace*
# is true.
private def self.trailing_chars_allowed?(ptr, finish, whitespace, strict)
  return true unless strict

  rest = ptr
  if whitespace
    until rest >= finish || !rest.value.unsafe_chr.ascii_whitespace?
      rest += 1
    end
  end
  rest == finish
end
end | ||
end | ||
|
||
require "./fast_float/parse_number" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,270 @@ | ||
require "./float_common" | ||
|
||
module Float::FastFloat | ||
# Returns whether the code unit *c* is an ASCII decimal digit (`'0'..'9'`).
#
# This could be micro-optimized, but compilers already optimize the plain
# range test well.
def self.is_integer?(c : UC) : Bool forall UC
  '0'.ord <= c && c <= '9'.ord
end
|
||
# Loads the 8 bytes starting at *chars* into a `UInt64`.
#
# The pointer is reinterpreted as `UInt8*`, so exactly `sizeof(UInt64)` bytes
# are copied regardless of `UC`. On big-endian systems the result is
# byte-swapped, so the first byte read always ends up in the least
# significant byte of the result.
def self.read8_to_u64(chars : UC*) : UInt64 forall UC
  word = uninitialized UInt64
  pointerof(word).as(UInt8*).copy_from(chars.as(UInt8*), sizeof(UInt64))
  {% if IO::ByteFormat::SystemEndian == IO::ByteFormat::BigEndian %}
    word.byte_swap
  {% else %}
    word
  {% end %}
end
|
||
# Converts eight ASCII digits packed into *val* (first digit in the lowest
# byte, as produced by `read8_to_u64`) into the number they spell.
#
# The caller is expected to have checked that every byte is a digit, e.g.
# with `is_made_of_eight_digits_fast?`.
#
# credit @aqrit
def self.parse_eight_digits_unrolled(val : UInt64) : UInt32
  # Turn each ASCII byte into its digit value.
  digits = val &- 0x3030303030303030
  # Merge neighbouring digits into two-digit values: (digits * 2561) >> 8.
  pairs = (digits &* 10) &+ digits.unsafe_shr(8)
  # Weight the four two-digit groups; the sum lands in the upper 32 bits.
  lanes_a = (pairs & 0x000000FF000000FF_u64) &* 0x000F424000000064_u64                # 100 + (1000000ULL << 32)
  lanes_b = (pairs.unsafe_shr(16) & 0x000000FF000000FF_u64) &* 0x0000271000000001_u64 # 1 + (10000ULL << 32)
  (lanes_a &+ lanes_b).unsafe_shr(32).to_u32!
end
|
||
# Pointer convenience overload: loads 8 bytes from *chars* and converts them.
# Only call this when those 8 bytes are known to be digits.
def self.parse_eight_digits_unrolled(chars : UC*) : UInt32 forall UC
  packed = read8_to_u64(chars)
  parse_eight_digits_unrolled(packed)
end
|
||
# Tests all eight bytes packed in *val* at once for being ASCII digits.
#
# Adding 0x46 sets the top bit of any byte above `'9'` (0x39), and
# subtracting 0x30 sets the top bit of any byte below `'0'`. The value
# consists only of digits when no top bit ends up set.
#
# credit @aqrit
def self.is_made_of_eight_digits_fast?(val : UInt64) : Bool
  above_nine = val &+ 0x4646464646464646_u64
  below_zero = val &- 0x3030303030303030_u64
  flagged = (above_nine | below_zero) & 0x8080808080808080_u64
  flagged == 0
end
|
||
# Consumes digits from `p...pend` eight at a time for as long as a full group
# of eight digits is available, folding each group into *i*.
#
# NOTE(crystal): returns `{p, i}`, the advanced pointer and the updated
# accumulator, because the arguments cannot be updated in place.
def self.loop_parse_if_eight_digits(p : UInt8*, pend : UInt8*, i : UInt64) : {UInt8*, UInt64}
  # optimizes better than parse_if_eight_digits_unrolled() for UC = char.
  loop do
    break if pend - p < 8
    chunk = read8_to_u64(p)
    break unless is_made_of_eight_digits_fast?(chunk)
    # in rare cases, this will overflow, but that's ok
    i = i &* 100000000 &+ parse_eight_digits_unrolled(chunk)
    p += 8
  end
  {p, i}
end
|
||
# Reason recorded by `parse_number_string` when it rejects its input.
# Members tagged "[JSON-only]" are only reported when the JSON format is
# selected.
enum ParseError
  # Nothing went wrong; this is also the default of
  # `ParsedNumberStringT#error`.
  NoError

  # [JSON-only] A minus sign was not followed by a digit.
  MissingIntegerAfterSign

  # A sign was followed by neither a digit nor the decimal point. This also
  # covers input that ends immediately after the sign.
  MissingIntegerOrDotAfterSign

  # [JSON-only] The integer part has more than one digit and starts with `0`.
  LeadingZerosInIntegerPart

  # [JSON-only] The integer part is empty.
  NoDigitsInIntegerPart

  # [JSON-only] A decimal point is present but no digits follow it.
  NoDigitsInFractionalPart

  # Neither the integer part nor the fractional part contains a digit.
  NoDigitsInMantissa

  # The format demands scientific notation (and does not allow fixed
  # notation), but the exponent is missing or has no digits.
  MissingExponentialPart
end
|
||
# Result of tokenizing a decimal number with `parse_number_string`.
struct ParsedNumberStringT(UC)
  # Power of ten to apply to `mantissa`.
  property exponent : Int64 = 0_i64
  # Decimal digits of the number folded into one integer (truncated when
  # `too_many_digits` is set).
  property mantissa : UInt64 = 0_u64
  # Position where parsing stopped: one past the number on success, or the
  # offending position on failure.
  property lastmatch : UC* = Pointer(UC).null
  # Whether the input started with a minus sign.
  property negative : Bool = false
  # Whether a number was recognized; when false, see `error`.
  property valid : Bool = false
  # Set when the input has more than 19 significant digits, so `mantissa`
  # holds only a truncated prefix of them.
  property too_many_digits : Bool = false
  # The spans below delimit the significant digits in the input:
  # digits before the decimal point (always assigned by the parser)
  property integer : Slice(UC) = Slice(UC).empty
  # digits after the decimal point (left empty when there is none)
  property fraction : Slice(UC) = Slice(UC).empty
  # Failure reason; `ParseError::NoError` unless parsing failed.
  property error : ParseError = ParseError::NoError
end

# Byte-oriented spellings of the generic types above.
alias ByteSpan = ::Bytes
alias ParsedNumberString = ParsedNumberStringT(UInt8)
|
||
# Builds the failure result for `parse_number_string`: an invalid
# `ParsedNumberStringT` whose `lastmatch` is *p* (where parsing gave up) and
# whose `error` is the given reason.
def self.report_parse_error(p : UC*, error : ParseError) : ParsedNumberStringT(UC) forall UC
  failure = ParsedNumberStringT(UC).new
  failure.error = error
  failure.lastmatch = p
  failure.valid = false
  failure
end
|
||
# Tokenizes the ASCII decimal number found in `p...pend`.
#
# On success the result has `valid` set and carries:
#
# - `negative`: whether the input started with `-`;
# - `mantissa` / `exponent`: the digits folded into a `UInt64` and the power
#   of ten to apply to it. Up to 19 digits are represented exactly; beyond
#   that `too_many_digits` is set and the mantissa is a truncated prefix;
# - `integer` / `fraction`: the digit spans before and after the decimal
#   point;
# - `lastmatch`: the position right after the number.
#
# On failure the result comes from `report_parse_error` and holds the reason
# in `error`.
#
# *options* supplies the accepted format (`fmt`) and the decimal point
# character.
#
# NOTE: the first byte is read before any bounds check, so callers must
# ensure `p != pend`.
def self.parse_number_string(p : UC*, pend : UC*, options : ParseOptionsT(UC)) : ParsedNumberStringT(UC) forall UC
  fmt = options.format
  decimal_point = options.decimal_point

  # Built with the caller's character type so that it agrees with the
  # declared return type and with what `report_parse_error` produces.
  answer = ParsedNumberStringT(UC).new
  answer.valid = false
  answer.too_many_digits = false
  answer.negative = p.value === '-'

  if p.value === '-' || (!fmt.json_fmt? && p.value === '+')
    p += 1
    if p == pend
      return report_parse_error(p, :missing_integer_or_dot_after_sign)
    end
    if fmt.json_fmt?
      if !is_integer?(p.value) # a sign must be followed by an integer
        return report_parse_error(p, :missing_integer_after_sign)
      end
    else
      if !is_integer?(p.value) && p.value != decimal_point # a sign must be followed by an integer or the dot
        return report_parse_error(p, :missing_integer_or_dot_after_sign)
      end
    end
  end
  start_digits = p

  i = 0_u64 # an unsigned int avoids signed overflows (which are bad)

  while p != pend && is_integer?(p.value)
    # a multiplication by 10 is cheaper than an arbitrary integer multiplication
    i = i &* 10 &+ (p.value &- '0'.ord).to_u64! # might overflow, we will handle the overflow later
    p += 1
  end
  end_of_integer_part = p
  digit_count = (end_of_integer_part - start_digits).to_i64!
  answer.integer = Slice.new(start_digits, digit_count.to_i32)
  if fmt.json_fmt?
    # at least 1 digit in integer part, without leading zeros
    if digit_count == 0
      return report_parse_error(p, :no_digits_in_integer_part)
    end
    if start_digits[0] === '0' && digit_count > 1
      return report_parse_error(p, :leading_zeros_in_integer_part)
    end
  end

  exponent = 0_i64
  has_decimal_point = p != pend && p.value == decimal_point
  if has_decimal_point
    p += 1
    before = p
    # can occur at most twice without overflowing, but let it occur more, since
    # for integers with many digits, digit parsing is the primary bottleneck.
    p, i = loop_parse_if_eight_digits(p, pend, i)

    while p != pend && is_integer?(p.value)
      digit = (p.value &- '0'.ord).to_u8!
      p += 1
      i = i &* 10 &+ digit # in rare cases, this will overflow, but that's ok
    end
    # Every fractional digit scales the mantissa down by ten, so the exponent
    # is minus the number of fractional digits.
    exponent = before - p
    answer.fraction = Slice.new(before, (p - before).to_i32)
    digit_count &-= exponent
  end
  if fmt.json_fmt?
    # at least 1 digit in fractional part
    if has_decimal_point && exponent == 0
      return report_parse_error(p, :no_digits_in_fractional_part)
    end
  elsif digit_count == 0 # we must have encountered at least one integer!
    return report_parse_error(p, :no_digits_in_mantissa)
  end
  exp_number = 0_i64 # explicit exponential part
  if (fmt.scientific? && p != pend && p.value.unsafe_chr.in?('e', 'E')) ||
     (fmt.fortran_fmt? && p != pend && p.value.unsafe_chr.in?('+', '-', 'd', 'D'))
    location_of_e = p
    if p.value.unsafe_chr.in?('e', 'E', 'd', 'D')
      p += 1
    end
    neg_exp = false
    if p != pend && p.value === '-'
      neg_exp = true
      p += 1
    elsif p != pend && p.value === '+' # '+' on exponent is allowed by C++17 20.19.3.(7.1)
      p += 1
    end
    if p == pend || !is_integer?(p.value)
      if !fmt.fixed?
        # The exponential part is invalid for scientific notation, so it must
        # be a trailing token for fixed notation. However, fixed notation is
        # disabled, so report a scientific notation error.
        return report_parse_error(p, :missing_exponential_part)
      end
      # Otherwise, we will be ignoring the 'e'.
      p = location_of_e
    else
      while p != pend && is_integer?(p.value)
        digit = (p.value &- '0'.ord).to_u8!
        # Stop accumulating once the exponent is huge; further digits are
        # still consumed but can no longer overflow `exp_number`.
        if exp_number < 0x10000000
          exp_number = exp_number &* 10 &+ digit
        end
        p += 1
      end
      if neg_exp
        exp_number = 0_i64 &- exp_number
      end
      exponent &+= exp_number
    end
  else
    # If the format is scientific and not fixed, we have to bail out.
    if fmt.scientific? && !fmt.fixed?
      return report_parse_error(p, :missing_exponential_part)
    end
  end
  answer.lastmatch = p
  answer.valid = true

  # If we frequently had to deal with long strings of digits,
  # we could extend our code by using a 128-bit integer instead
  # of a 64-bit integer. However, this is uncommon.
  #
  # We can deal with up to 19 digits.
  if digit_count > 19 # this is uncommon
    # It is possible that the integer had an overflow.
    # We have to handle the case where we have 0.0000somenumber.
    # We need to be mindful of the case where we only have zeroes...
    # E.g., 0.000000000...000.
    start = start_digits
    while start != pend && (start.value === '0' || start.value == decimal_point)
      if start.value === '0'
        digit_count &-= 1
      end
      start += 1
    end

    if digit_count > 19
      answer.too_many_digits = true
      # Let us start again, this time, avoiding overflows.
      # We don't need to check if is_integer, since we use the
      # pre-tokenized spans from above.
      i = 0_u64
      p = answer.integer.to_unsafe
      int_end = p + answer.integer.size
      minimal_nineteen_digit_integer = 1000000000000000000_u64
      while i < minimal_nineteen_digit_integer && p != int_end
        i = i &* 10 &+ (p.value &- '0'.ord).to_u64!
        p += 1
      end
      if i >= minimal_nineteen_digit_integer # We have a big integer
        exponent = (end_of_integer_part - p) &+ exp_number
      else # We have a value with a fractional component.
        p = answer.fraction.to_unsafe
        frac_end = p + answer.fraction.size
        while i < minimal_nineteen_digit_integer && p != frac_end
          i = i &* 10 &+ (p.value &- '0'.ord).to_u64!
          p += 1
        end
        exponent = (answer.fraction.to_unsafe - p) &+ exp_number
      end
      # We have now corrected both exponent and i, to a truncated value
    end
  end
  answer.exponent = exponent
  answer.mantissa = i
  answer
end
end |
Oops, something went wrong.