diff --git a/docs/language.js b/docs/language.js index ab3d5819..5286df00 100644 --- a/docs/language.js +++ b/docs/language.js @@ -529,11 +529,11 @@ var data = [ }, { - "parameter" : "sort [* function] list l", + "parameter" : "sort [* function] list l [number k]", "output" : "list", "new value" : "new", "new target scope": true, - "description" : "Returns a new list containing the list with its elements sorted in increasing order. Numerical values come before strings, and code will be evaluated as the representative strings. If function is specified, it pushes a pair of new target scope onto the stack, so that current_value accesses a list of elements to from the list, and current_index accesses the list or assoc index if it is not already reduced, with target representing the original list or assoc, and evaluates function. The function should return a number, positive if \"(current_value)\" is greater, negative if \"(current_value 1)\" is greater, 0 if equal.", + "description" : "Returns a new list containing the list with its elements sorted in increasing order. Numerical values come before strings, and code will be evaluated as the representative strings. If function is specified and not null, it pushes a pair of new target scope onto the stack, so that current_value accesses a list of elements to from the list, and current_index accesses the list or assoc index if it is not already reduced, with target representing the original list or assoc, and evaluates function. The function should return a number, positive if \"(current_value)\" is greater, negative if \"(current_value 1)\" is greater, 0 if equal. 
If k is specified in addition to function, then it will only return the k smallest values sorted in order, or, if k is negative, it will ignore the negative sign and return the highest k values.", "example" : "(print (sort (list 4 9 3 5 1)))\n(print (sort (list \"n\" \"b\" \"hello\" 4 1 3.2 (list 1 2 3))))\n(print (sort (list 1 \"1x\" \"10\" 20 \"z2\" \"z10\" \"z100\")))\n(print (sort (lambda (- (current_value) (current_value 1))) (list 4 9 3 5 1)))" }, diff --git a/src/Amalgam/Parser.cpp b/src/Amalgam/Parser.cpp index 63951ed0..a90cb0d4 100644 --- a/src/Amalgam/Parser.cpp +++ b/src/Amalgam/Parser.cpp @@ -232,15 +232,15 @@ void Parser::SkipWhitespaceAndAccumulateAttributes(EvaluableNode *target) while(pos < code->size()) { //eat any whitespace - if(StringManipulation::IsUtf8Whitespace(*code, pos)) + if(size_t space_size = StringManipulation::IsUtf8Whitespace(*code, pos); space_size > 0) { - if(StringManipulation::IsUtf8Newline(*code, pos)) + if(StringManipulation::IsUtf8Newline(*code, pos) > 0) { lineNumber++; - lineStartPos = pos + 1; + lineStartPos = pos + space_size; } - pos++; + pos += space_size; continue; } diff --git a/src/Amalgam/amlg_code/full_test.amlg b/src/Amalgam/amlg_code/full_test.amlg index 5c6a4643..465ed9d7 100644 --- a/src/Amalgam/amlg_code/full_test.amlg +++ b/src/Amalgam/amlg_code/full_test.amlg @@ -868,6 +868,12 @@ "2020-06-08 lunes 11.33.46" ))) + (print (sort (null) (list 4 9 3 5 1) 2)) + (print (sort (null) (list 4 9 3 5 1) -2)) + + (print (sort (lambda (- (current_value) (current_value 1))) (list 4 9 3 5 1) 2)) + (print (sort (lambda (- (current_value) (current_value 1))) (list 4 9 3 5 1) -2)) + (print "--indices--\n") (print (indices (associate "a" 1 "b" 2 "c" 3 4 "d"))) (print (indices (list "a" 1 "b" 2 "c" 3 4 "d"))) diff --git a/src/Amalgam/amlg_code/test.amlg b/src/Amalgam/amlg_code/test.amlg index 64812390..d7d1cbdb 100644 --- a/src/Amalgam/amlg_code/test.amlg +++ b/src/Amalgam/amlg_code/test.amlg @@ -1,119 +1 @@ -(seq - - (print "40 
" (generalized_distance - (list 1 1) ;weights - (list "nominal_string") ;types - (list 4) ;attributes - (list - (assoc - a (assoc a 0.00744879 b 0.996275605 c 0.996275605) - b (assoc a 0.501736111 b 0.501736111 c 0.996527778) - c (assoc a 0.996539792 b 0.996539792 c 0.006920415) - ) - ) ;deviations - 1 ;p - (list "b") ;vector 1 - (list "c") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) - - (print "41 " (generalized_distance - (list 1 1) ;weights - (list "nominal_string") ;types - (list 4) ;attributes - (list - (assoc - a (assoc a 0.00744879 b 0.996275605 c 0.996275605) - b (assoc a 0.501736111 b 0.501736111 c 0.996527778) - c (assoc a 0.996539792 b 0.996539792 c 0.006920415) - ) - ) ;deviations - 1 ;p - (list "b") ;vector 1 - (list "a") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) - - (print "42 " (generalized_distance - (list 1 1) ;weights - (list "nominal_string") ;types - (list 4) ;attributes - (list - (assoc - a (assoc a 0.00744879 b 0.996275605 c 0.996275605) - b (list (assoc a 0.501736111 b 0.501736111 c 0.996527778) 0.8) - c (assoc a 0.996539792 b 0.996539792 c 0.006920415) - ) - ) ;deviations - 1 ;p - (list "b") ;vector 1 - (list "q") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) - - (print "43 " (generalized_distance - (list 1 1) ;weights - (list "nominal_string") ;types - (list 2 2) ;attributes - (list - 0.2 - ) ;deviations - 1 ;p - (list "q") ;vector 1 - (list "u") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) - - (print "44 " (generalized_distance - (list 1 1) ;weights - (list "nominal_string") ;types - (list 4) ;attributes - (list - (list (assoc - a (assoc a 0.00744879 b 0.996275605 c 0.996275605) - b (list (assoc a 0.501736111 b 0.501736111 c 0.996527778) 0.8) - c (assoc a 0.996539792 b 0.996539792 c 0.006920415) - ) - 0.2 - ) - ) ;deviations - 1 ;p - (list "q") ;vector 1 - (list "u") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) - - (print "45 " (generalized_distance - (list 1 1) 
;weights - (list "nominal_string") ;types - (list 4) ;attributes - (list - (list (list - (assoc - a (assoc a 0.00744879 b 0.996275605 c 0.996275605) - b (list (assoc a 0.501736111 b 0.501736111 c 0.996527778) 0.8) - c (assoc a 0.996539792 b 0.996539792 c 0.006920415) - ) - 0.2 - ) - 0.2 - ) - ) ;deviations - 1 ;p - (list "q") ;vector 1 - (list "u") ;vector 2 - (null) ;names - (true) ;surprisal - ) "\n" - ) -) +(6) \ No newline at end of file diff --git a/src/Amalgam/entity/EntityQueriesStatistics.h b/src/Amalgam/entity/EntityQueriesStatistics.h index 657336ad..4d2dd801 100644 --- a/src/Amalgam/entity/EntityQueriesStatistics.h +++ b/src/Amalgam/entity/EntityQueriesStatistics.h @@ -272,7 +272,7 @@ class EntityQueriesStatistics if(FastIsNaN(q_percentage) || q_percentage < 0.0 || q_percentage > 1.0) return std::numeric_limits::quiet_NaN(); - std::vector>& value_weights = values_buffer; + std::vector> &value_weights = values_buffer; value_weights.clear(); double total_weight = 0.0; bool eq_or_no_weights = true; diff --git a/src/Amalgam/evaluablenode/EvaluableNode.h b/src/Amalgam/evaluablenode/EvaluableNode.h index 0d62dbed..c25a0184 100644 --- a/src/Amalgam/evaluablenode/EvaluableNode.h +++ b/src/Amalgam/evaluablenode/EvaluableNode.h @@ -274,6 +274,11 @@ class EvaluableNode return IsLessThan(a, b, false); } + static inline bool IsStrictlyGreaterThan(EvaluableNode *a, EvaluableNode *b) + { + return !IsLessThan(a, b, true); + } + //if the node's contents can be represented as a number, which includes numbers, infinity, and even null and NaN, then return true // otherwise returns false static constexpr bool CanRepresentValueAsANumber(EvaluableNode *e) diff --git a/src/Amalgam/interpreter/InterpreterOpcodesTransformations.cpp b/src/Amalgam/interpreter/InterpreterOpcodesTransformations.cpp index bfa2bf90..5063f1fa 100644 --- a/src/Amalgam/interpreter/InterpreterOpcodesTransformations.cpp +++ b/src/Amalgam/interpreter/InterpreterOpcodesTransformations.cpp @@ -855,31 +855,77 
@@ EvaluableNodeReference Interpreter::InterpretNode_ENT_SORT(EvaluableNode *en, bo if(ocn.size() < 1) return EvaluableNodeReference::Null(); - if(ocn.size() == 1) + size_t list_index = (ocn.size() == 1 ? 0 : 1); + + EvaluableNodeReference function; + size_t highest_k = 0; + size_t lowest_k = 0; + + if(ocn.size() == 3) + { + double k = InterpretNodeIntoNumberValue(ocn[2]); + if(k > 0) + lowest_k = static_cast(k); + else if(k < 0) + highest_k = static_cast(-k); + //else nan, leave both as zero + } + + if(ocn.size() >= 2) + function = InterpretNodeForImmediateUse(ocn[0]); + + if(EvaluableNode::IsNull(function)) { //get list - auto list = InterpretNode(ocn[0]); + auto list = InterpretNode(ocn[list_index]); if(list == nullptr) return EvaluableNodeReference::Null(); //make sure it is an editable copy evaluableNodeManager->EnsureNodeIsModifiable(list); - std::sort(begin(list->GetOrderedChildNodes()), end(list->GetOrderedChildNodes()), EvaluableNode::IsStrictlyLessThan); + auto &list_ocn = list->GetOrderedChildNodes(); + + if(highest_k > 0 && highest_k < list_ocn.size()) + { + std::partial_sort(begin(list_ocn), + begin(list_ocn) + highest_k, + end(list_ocn), EvaluableNode::IsStrictlyGreaterThan); + + if(list.unique && !list->GetNeedCycleCheck()) + { + for(size_t i = highest_k; i < list_ocn.size(); i++) + evaluableNodeManager->FreeNodeTree(list_ocn[i]); + } + + list_ocn.erase(begin(list_ocn) + highest_k, end(list_ocn)); + } + else if(lowest_k > 0 && lowest_k < list_ocn.size()) + { + std::partial_sort(begin(list_ocn), begin(list_ocn) + lowest_k, + end(list_ocn), EvaluableNode::IsStrictlyLessThan); + + if(list.unique && !list->GetNeedCycleCheck()) + { + for(size_t i = lowest_k; i < list_ocn.size(); i++) + evaluableNodeManager->FreeNodeTree(list_ocn[i]); + } + + list_ocn.erase(begin(list_ocn) + lowest_k, end(list_ocn)); + } + else + { + std::sort(begin(list_ocn), end(list_ocn), EvaluableNode::IsStrictlyLessThan); + } return list; } else { - //get function to apply to list - 
auto function = InterpretNodeForImmediateUse(ocn[0]); - if(function == nullptr) - return EvaluableNodeReference::Null(); - auto node_stack = CreateInterpreterNodeStackStateSaver(function); - + //get list - auto list = InterpretNode(ocn[1]); + auto list = InterpretNode(ocn[list_index]); if(list == nullptr) return EvaluableNodeReference::Null(); @@ -891,6 +937,17 @@ EvaluableNodeReference Interpreter::InterpretNode_ENT_SORT(EvaluableNode *en, bo //sort list; can't use the C++ sort function because it requires weak ordering and will crash otherwise // the custom comparator does not guarantee this std::vector sorted = CustomEvaluableNodeOrderedChildNodesSort(list->GetOrderedChildNodes(), comparator); + + if(highest_k > 0 && highest_k < sorted.size()) + { + sorted.erase(begin(sorted), begin(sorted) + (sorted.size() - highest_k)); + std::reverse(begin(sorted), end(sorted)); + } + else if(lowest_k > 0 && lowest_k < sorted.size()) + { + sorted.erase(begin(sorted) + lowest_k, end(sorted)); + } + list->SetOrderedChildNodes(sorted); return list; diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt index f0074bdd..d2452646 100644 --- a/src/Amalgam/out.txt +++ b/src/Amalgam/out.txt @@ -1018,6 +1018,10 @@ abcdef "2020-06-08 lunes 11.33.47" "2020-06-08 lunes 11.33.48" ) +(list 1 3) +(list 9 5) +(list 1 3) +(list 9 5) --indices-- (list "b" "4" "a" "c") (list @@ -1249,7 +1253,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1262,10 +1266,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 
1716570941.005401 + start_time 1716915936.050118 www 1 x 12 zz 10 @@ -1292,7 +1296,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1305,10 +1309,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1716570941.005401 + start_time 1716915936.050118 www 1 x 12 zz 10 @@ -1334,7 +1338,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1347,10 +1351,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1716570941.005401 + start_time 1716915936.050118 www 1 x 12 zz 10 @@ -1618,7 +1622,7 @@ e: - .inf 25: (assoc a 1) -current date-time in epoch: 2024-05-24-13.15.41.2972120 +current date-time in epoch: 2024-05-28-13.05.36.1075120 2020-06-07 00:22:59 1391230800 1391230800 @@ -3438,7 +3442,7 @@ deep sets --set_entity_root_permission-- RootTest -1716570941.565219 +1716915936.289861 (true) RootTest @@ -4681,4 +4685,4 @@ concurrent entity writes successful: (true) --clean-up test files-- --total execution time-- -1.186690092086792 +1.9868049621582031 diff --git a/src/Amalgam/string/StringManipulation.h 
b/src/Amalgam/string/StringManipulation.h index 3a9a09f4..0332cd8e 100644 --- a/src/Amalgam/string/StringManipulation.h +++ b/src/Amalgam/string/StringManipulation.h @@ -24,13 +24,14 @@ namespace StringManipulation //to only contain the portion of the string after the removed section std::vector SplitArgString(std::string &arg_string, bool greedy = true); - //returns true if the character in the string s starting at position is whitespace - inline bool IsUtf8Whitespace(std::string &s, size_t position) + //returns the number of bytes wide the character in position of string s is if it is whitespace, + // 0 if it is not whitespace + inline size_t IsUtf8Whitespace(std::string &s, size_t position) { auto cur_char = s[position]; if(cur_char == '\t' || cur_char == '\n' || cur_char == '\v' || cur_char == '\f' - || cur_char == '\r' || cur_char == ' ') - return true; + || cur_char == '\r' || cur_char == ' ') + return 1; //need to additionally check the following multicharacter utf-8 code points: //name hex dec bytes @@ -53,19 +54,21 @@ namespace StringManipulation // medium mathematical space U + 205F 8287 0xE2 0x81 0x9F // ideographic space U + 3000 12288 0xE3 0x80 0x80 + //need at least 2 characters for the remaining whitespace possibilities if(position + 2 >= s.size()) - return false; + return 0; if(static_cast(cur_char) == 0xC2 && static_cast(s[position + 1]) == 0xA0) - return true; + return 2; - //need 3 characters for the remaining + //need 3 characters for the remaining whitespace possibilities if(position + 3 >= s.size()) - return false; + return 0; - if(static_cast(cur_char) == 0xE1 && static_cast(s[position + 1]) == 0x9A - && static_cast(s[position + 2]) == 0x80) - return true; + if(static_cast(cur_char) == 0xE1 + && static_cast(s[position + 1]) == 0x9A + && static_cast(s[position + 2]) == 0x80) + return 3; if(static_cast(cur_char) == 0xE2) { @@ -73,19 +76,20 @@ namespace StringManipulation { uint8_t third_char = s[position + 2]; if(third_char >= 0x80 && third_char 
<= 0xAF) - return true; + return 3; } else if(static_cast(s[position + 1]) == 0x81 && static_cast(s[position + 2]) == 0x9F) { - return true; + return 3; } } - if(static_cast(cur_char) == 0xE3 && static_cast(s[position + 1]) == 0x80 - && static_cast(s[position + 2]) == 0x80) - return true; + if(static_cast(cur_char) == 0xE3 + && static_cast(s[position + 1]) == 0x80 + && static_cast(s[position + 2]) == 0x80) + return 3; - return false; + return 0; } //returns true if c is a numeric digit @@ -94,13 +98,14 @@ namespace StringManipulation return (c >= '0' && c <= '9'); } - //returns true if the character in the string s starting at position is a newline - inline bool IsUtf8Newline(std::string &s, size_t position) + //returns the number of bytes wide the character in position of string s is if it is a newline, + // 0 if it is not a newline + inline size_t IsUtf8Newline(std::string &s, size_t position) { auto cur_char = s[position]; //don't count carriage returns (\r) as new lines, since it just moves the cursor if(cur_char == '\n' || cur_char == '\v' || cur_char == '\f') - return true; + return 1; if(position + 3 < s.size()) { @@ -108,14 +113,14 @@ namespace StringManipulation { //line separator if(static_cast(s[position + 1]) == 0x80 && static_cast(s[position + 2]) == 0xA8) - return true; + return 3; //paragraph separator else if(static_cast(s[position + 1]) == 0x80 && static_cast(s[position + 2]) == 0xA9) - return true; + return 3; } } - return false; + return 0; } //returns the length of the UTF-8 character in s starting at the specified offset