From c92591373a330fe2dca120c09879f829e76098c2 Mon Sep 17 00:00:00 2001
From: howsohazard <143410553+howsohazard@users.noreply.github.com>
Date: Mon, 4 Dec 2023 12:03:32 -0500
Subject: [PATCH] 18578: Improves performance by tightening inner interpreter
 loops and hash map functions, evens out garbage collection thresholds (#39)

Co-authored-by: J. Caleb Wherry <337871+calebwherry@users.noreply.github.com>
---
 src/3rd_party/skarupke_maps/flat_hash_map.hpp | 146 +++++++++---------
 src/Amalgam/ThreadPool.cpp                    |  20 ++-
 src/Amalgam/ThreadPool.h                      |  11 +-
 .../evaluablenode/EvaluableNodeManagement.cpp |  42 +----
 .../evaluablenode/EvaluableNodeManagement.h   |  55 ++++++-
 src/Amalgam/interpreter/Interpreter.cpp       |  12 +-
 src/Amalgam/interpreter/Interpreter.h         |  27 ++--
 7 files changed, 176 insertions(+), 137 deletions(-)
diff --git a/src/3rd_party/skarupke_maps/flat_hash_map.hpp b/src/3rd_party/skarupke_maps/flat_hash_map.hpp
index 222fc901..13d1a38c 100644
--- a/src/3rd_party/skarupke_maps/flat_hash_map.hpp
+++ b/src/3rd_party/skarupke_maps/flat_hash_map.hpp
@@ -47,12 +47,12 @@ struct functor_storage : Functor
     {
     }
     template<typename... Args>
-    Result operator()(Args &&... args)
+    inline Result operator()(Args &&... args)
     {
         return static_cast<Functor &>(*this)(std::forward<Args>(args)...);
     }
     template<typename... Args>
-    Result operator()(Args &&... args) const
+    inline Result operator()(Args &&... args) const
     {
         return static_cast<const Functor &>(*this)(std::forward<Args>(args)...);
     }
@@ -66,15 +66,15 @@ struct functor_storage<Result, Result (*)(Args...)>
         : function(function)
     {
     }
-    Result operator()(Args... args) const
+    inline Result operator()(Args... args) const
     {
         return function(std::forward<Args>(args)...);
     }
-    operator function_ptr &()
+    inline operator function_ptr &()
     {
         return function;
     }
-    operator const function_ptr &()
+    inline operator const function_ptr &()
     {
         return function;
     }
@@ -88,29 +88,29 @@ struct KeyOrValueHasher : functor_storage<size_t, hasher>
         : hasher_storage(hash)
     {
     }
-    size_t operator()(const key_type & key)
+    inline size_t operator()(const key_type & key)
     {
         return static_cast<hasher_storage &>(*this)(key);
     }
-    size_t operator()(const key_type & key) const
+    inline size_t operator()(const key_type & key) const
     {
         return static_cast<const hasher_storage &>(*this)(key);
     }
-    size_t operator()(const value_type & value)
+    inline size_t operator()(const value_type & value)
     {
         return static_cast<hasher_storage &>(*this)(value.first);
     }
-    size_t operator()(const value_type & value) const
+    inline size_t operator()(const value_type & value) const
     {
         return static_cast<const hasher_storage &>(*this)(value.first);
     }
     template<typename F, typename S>
-    size_t operator()(const std::pair<F, S> & value)
+    inline size_t operator()(const std::pair<F, S> & value)
     {
         return static_cast<hasher_storage &>(*this)(value.first);
     }
     template<typename F, typename S>
-    size_t operator()(const std::pair<F, S> & value) const
+    inline size_t operator()(const std::pair<F, S> & value) const
     {
         return static_cast<const hasher_storage &>(*this)(value.first);
     }
@@ -124,44 +124,44 @@ struct KeyOrValueEquality : functor_storage<bool, key_equal>
         : equality_storage(equality)
     {
     }
-    bool operator()(const key_type & lhs, const key_type & rhs)
+    inline bool operator()(const key_type & lhs, const key_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs, rhs);
     }
-    bool operator()(const key_type & lhs, const value_type & rhs)
+    inline bool operator()(const key_type & lhs, const value_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs, rhs.first);
     }
-    bool operator()(const value_type & lhs, const key_type & rhs)
+    inline bool operator()(const value_type & lhs, const key_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs);
     }
-    bool operator()(const value_type & lhs, const value_type & rhs)
+    inline bool operator()(const value_type & lhs, const value_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs.first);
     }
     template<typename F, typename S>
-    bool operator()(const key_type & lhs, const std::pair<F, S> & rhs)
+    inline bool operator()(const key_type & lhs, const std::pair<F, S> & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs, rhs.first);
     }
     template<typename F, typename S>
-    bool operator()(const std::pair<F, S> & lhs, const key_type & rhs)
+    inline bool operator()(const std::pair<F, S> & lhs, const key_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs);
     }
     template<typename F, typename S>
-    bool operator()(const value_type & lhs, const std::pair<F, S> & rhs)
+    inline bool operator()(const value_type & lhs, const std::pair<F, S> & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs.first);
     }
     template<typename F, typename S>
-    bool operator()(const std::pair<F, S> & lhs, const value_type & rhs)
+    inline bool operator()(const std::pair<F, S> & lhs, const value_type & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs.first);
     }
     template<typename FL, typename SL, typename FR, typename SR>
-    bool operator()(const std::pair<FL, SL> & lhs, const std::pair<FR, SR> & rhs)
+    inline bool operator()(const std::pair<FL, SL> & lhs, const std::pair<FR, SR> & rhs)
     {
         return static_cast<equality_storage &>(*this)(lhs.first, rhs.first);
     }
@@ -170,10 +170,10 @@ static constexpr int8_t min_lookups = 4;
 template<typename T>
 struct sherwood_v3_entry
 {
-    sherwood_v3_entry()
+    inline sherwood_v3_entry()
     {
     }
-    sherwood_v3_entry(int8_t distance_from_desired)
+    inline sherwood_v3_entry(int8_t distance_from_desired)
         : distance_from_desired(distance_from_desired)
     {
     }
@@ -186,15 +186,15 @@ struct sherwood_v3_entry
         return result;
     }
 
-    bool has_value() const
+    inline bool has_value() const
     {
         return distance_from_desired >= 0;
     }
-    bool is_empty() const
+    inline bool is_empty() const
     {
         return distance_from_desired < 0;
     }
-    bool is_at_desired_position() const
+    inline bool is_at_desired_position() const
     {
         return distance_from_desired <= 0;
     }
@@ -205,7 +205,7 @@ struct sherwood_v3_entry
         distance_from_desired = distance;
     }
 
-    void destroy_value()
+    inline void destroy_value()
     {
         value.~T();
         distance_from_desired = -1;
@@ -298,7 +298,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
 public:
 
 #ifdef FLAT_HASH_MAP_AMALGAM_MEM_REDUCTION
-	static constexpr float _max_load_factor = 0.5f;
+    static constexpr float _max_load_factor = 0.5f;
 #endif
 
     using value_type = T;
@@ -370,7 +370,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
     sherwood_v3_table(const sherwood_v3_table & other, const ArgumentAlloc & alloc)
         : EntryAlloc(alloc), Hasher(other), Equal(other)
 #ifndef FLAT_HASH_MAP_AMALGAM_MEM_REDUCTION
-		, _max_load_factor(other._max_load_factor)
+        , _max_load_factor(other._max_load_factor)
 #endif
     {
         rehash_for_other_container(other);
@@ -454,15 +454,15 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         deallocate_data(entries, num_slots_minus_one, max_lookups);
     }
 
-    const allocator_type & get_allocator() const
+    inline const allocator_type & get_allocator() const
     {
         return static_cast<const allocator_type &>(*this);
     }
-    const ArgumentEqual & key_eq() const
+    inline const ArgumentEqual & key_eq() const
     {
         return static_cast<const ArgumentEqual &>(*this);
     }
-    const ArgumentHash & hash_function() const
+    inline const ArgumentHash & hash_function() const
     {
         return static_cast<const ArgumentHash &>(*this);
     }
@@ -483,11 +483,11 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         using pointer = ValueType *;
         using reference = ValueType &;
 
-        friend bool operator==(const templated_iterator & lhs, const templated_iterator & rhs)
+        inline friend bool operator==(const templated_iterator & lhs, const templated_iterator & rhs)
         {
             return lhs.current == rhs.current;
         }
-        friend bool operator!=(const templated_iterator & lhs, const templated_iterator & rhs)
+        inline friend bool operator!=(const templated_iterator & lhs, const templated_iterator & rhs)
         {
             return !(lhs == rhs);
         }
@@ -508,16 +508,16 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
             return copy;
         }
 
-        ValueType & operator*() const
+        inline ValueType & operator*() const
         {
             return current->value;
         }
-        ValueType * operator->() const
+        inline ValueType * operator->() const
         {
             return std::addressof(current->value);
         }
 
-        operator templated_iterator<const value_type>() const
+        inline operator templated_iterator<const value_type>() const
         {
             return { current };
         }
@@ -541,7 +541,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
                 return { it };
         }
     }
-    const_iterator cbegin() const
+    inline const_iterator cbegin() const
     {
         return begin();
     }
@@ -553,7 +553,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
     {
         return { entries + static_cast<ptrdiff_t>(num_slots_minus_one + max_lookups) };
     }
-    const_iterator cend() const
+    inline const_iterator cend() const
     {
         return end();
     }
@@ -569,11 +569,11 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         }
         return end();
     }
-    const_iterator find(const FindKey & key) const
+    inline const_iterator find(const FindKey & key) const
     {
         return const_cast<sherwood_v3_table *>(this)->find(key);
     }
-    size_t count(const FindKey & key) const
+    inline size_t count(const FindKey & key) const
     {
         return find(key) == end() ? 0 : 1;
     }
@@ -608,37 +608,37 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         return emplace_new_key(distance_from_desired, current_entry, std::forward<Key>(key), std::forward<Args>(args)...);
     }
 
-    std::pair<iterator, bool> insert(const value_type & value)
+    inline std::pair<iterator, bool> insert(const value_type & value)
     {
         return emplace(value);
     }
-    std::pair<iterator, bool> insert(value_type && value)
+    inline std::pair<iterator, bool> insert(value_type && value)
     {
         return emplace(std::move(value));
     }
     template<typename... Args>
-    iterator emplace_hint(const_iterator, Args &&... args)
+    inline iterator emplace_hint(const_iterator, Args &&... args)
     {
         return emplace(std::forward<Args>(args)...).first;
     }
-    iterator insert(const_iterator, const value_type & value)
+    inline iterator insert(const_iterator, const value_type & value)
     {
         return emplace(value).first;
     }
-    iterator insert(const_iterator, value_type && value)
+    inline iterator insert(const_iterator, value_type && value)
     {
         return emplace(std::move(value)).first;
     }
 
     template<typename It>
-    void insert(It begin, It end)
+    inline void insert(It begin, It end)
     {
         for (; begin != end; ++begin)
         {
             emplace(*begin);
         }
     }
-    void insert(std::initializer_list<value_type> il)
+    inline void insert(std::initializer_list<value_type> il)
     {
         insert(il.begin(), il.end());
     }
@@ -751,7 +751,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         num_elements = 0;
     }
 
-    void shrink_to_fit()
+    inline void shrink_to_fit()
     {
         rehash_for_other_container(*this);
     }
@@ -766,7 +766,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
             swap(static_cast<EntryAlloc &>(*this), static_cast<EntryAlloc &>(other));
     }
 
-    size_t size() const
+    inline size_t size() const
     {
         return num_elements;
     }
@@ -800,12 +800,12 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         _max_load_factor = value;
 #endif
     }
-    float max_load_factor() const
+    inline float max_load_factor() const
     {
         return _max_load_factor;
     }
 
-    bool empty() const
+    inline bool empty() const
     {
         return num_elements == 0;
     }
@@ -818,9 +818,9 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
     float _max_load_factor = 0.5f;
 #endif
     size_t num_elements = 0;
-	int8_t max_lookups = detailv3::min_lookups - 1;
+    int8_t max_lookups = detailv3::min_lookups - 1;
 
-	static int8_t compute_max_lookups(size_t num_buckets)
+    static int8_t compute_max_lookups(size_t num_buckets)
     {
         int8_t desired = detailv3::log2(num_buckets);
         return std::max(detailv3::min_lookups, desired);
@@ -917,17 +917,17 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
     }
 
     template<typename U>
-    size_t hash_object(const U & key)
+    inline size_t hash_object(const U & key)
     {
         return static_cast<Hasher &>(*this)(key);
     }
     template<typename U>
-    size_t hash_object(const U & key) const
+    inline size_t hash_object(const U & key) const
     {
         return static_cast<const Hasher &>(*this)(key);
     }
     template<typename L, typename R>
-    bool compares_equal(const L & lhs, const R & rhs)
+    inline bool compares_equal(const L & lhs, const R & rhs)
     {
         return static_cast<Equal &>(*this)(lhs, rhs);
     }
@@ -1271,23 +1271,23 @@ struct prime_number_hash_policy
 
 struct power_of_two_hash_policy
 {
-    size_t index_for_hash(size_t hash, size_t num_slots_minus_one) const
+    inline size_t index_for_hash(size_t hash, size_t num_slots_minus_one) const
     {
         return hash & num_slots_minus_one;
     }
-    size_t keep_in_range(size_t index, size_t num_slots_minus_one) const
+    inline size_t keep_in_range(size_t index, size_t num_slots_minus_one) const
     {
         return index_for_hash(index, num_slots_minus_one);
     }
-    int8_t next_size_over(size_t & size) const
+    inline int8_t next_size_over(size_t & size) const
     {
         size = detailv3::next_power_of_two(size);
         return 0;
     }
-    void commit(int8_t)
+    inline void commit(int8_t)
     {
     }
-    void reset()
+    inline void reset()
     {
     }
 
@@ -1295,11 +1295,11 @@ struct power_of_two_hash_policy
 
 struct fibonacci_hash_policy
 {
-    size_t index_for_hash(size_t hash, size_t /*num_slots_minus_one*/) const
+    inline size_t index_for_hash(size_t hash, size_t /*num_slots_minus_one*/) const
     {
         return (11400714819323198485ull * hash) >> shift;
     }
-    size_t keep_in_range(size_t index, size_t num_slots_minus_one) const
+    inline size_t keep_in_range(size_t index, size_t num_slots_minus_one) const
     {
         return index & num_slots_minus_one;
     }
@@ -1309,11 +1309,11 @@ struct fibonacci_hash_policy
         size = std::max(size_t(2), detailv3::next_power_of_two(size));
         return 64 - detailv3::log2(size);
     }
-    void commit(int8_t shift)
+    inline void commit(int8_t shift)
     {
         this->shift = shift;
     }
-    void reset()
+    inline void reset()
     {
         shift = 63;
     }
@@ -1381,7 +1381,7 @@ class flat_hash_map
     }
 
     using Table::emplace;
-    std::pair<typename Table::iterator, bool> emplace()
+    inline std::pair<typename Table::iterator, bool> emplace()
     {
         return emplace(key_type(), convertible_to_value());
     }
@@ -1434,7 +1434,7 @@ class flat_hash_map
 private:
     struct convertible_to_value
     {
-        operator V() const
+        inline operator V() const
         {
             return V();
         }
@@ -1476,23 +1476,23 @@ class flat_hash_set
     }
 
     template<typename... Args>
-    std::pair<typename Table::iterator, bool> emplace(Args &&... args)
+    inline std::pair<typename Table::iterator, bool> emplace(Args &&... args)
     {
         return Table::emplace(T(std::forward<Args>(args)...));
     }
-    std::pair<typename Table::iterator, bool> emplace(const key_type & arg)
+    inline std::pair<typename Table::iterator, bool> emplace(const key_type & arg)
     {
         return Table::emplace(arg);
     }
-    std::pair<typename Table::iterator, bool> emplace(key_type & arg)
+    inline std::pair<typename Table::iterator, bool> emplace(key_type & arg)
     {
         return Table::emplace(arg);
     }
-    std::pair<typename Table::iterator, bool> emplace(const key_type && arg)
+    inline std::pair<typename Table::iterator, bool> emplace(const key_type && arg)
     {
         return Table::emplace(std::move(arg));
     }
-    std::pair<typename Table::iterator, bool> emplace(key_type && arg)
+    inline std::pair<typename Table::iterator, bool> emplace(key_type && arg)
     {
         return Table::emplace(std::move(arg));
     }
@@ -1508,7 +1508,7 @@ class flat_hash_set
         }
         return true;
     }
-    friend bool operator!=(const flat_hash_set & lhs, const flat_hash_set & rhs)
+    inline friend bool operator!=(const flat_hash_set & lhs, const flat_hash_set & rhs)
     {
         return !(lhs == rhs);
     }
diff --git a/src/Amalgam/ThreadPool.cpp b/src/Amalgam/ThreadPool.cpp
index d7d66a43..26407e3c 100644
--- a/src/Amalgam/ThreadPool.cpp
+++ b/src/Amalgam/ThreadPool.cpp
@@ -14,7 +14,8 @@ ThreadPool::ThreadPool(size_t max_num_threads)
 
 void ThreadPool::ChangeThreadPoolSize(size_t new_max_num_threads)
 {
-	std::unique_lock<std::mutex> lock(threadsMutex);
+	std::unique_lock<std::mutex> threads_lock(threadsMutex, std::defer_lock), queue_lock(taskQueueMutex, std::defer_lock);
+	std::lock(threads_lock, queue_lock);	//take both without a fixed order; NOTE(review): other paths may hold taskQueueMutex before threadsMutex -- confirm
 
 	//don't need to change anything
 	if(new_max_num_threads == threads.size())
@@ -38,6 +39,11 @@ void ThreadPool::ChangeThreadPoolSize(size_t new_max_num_threads)
 		threads.emplace_back(
 			[this]
 		{
+			//count this thread as active during startup
+			//this is important, as the inner loop assumes the default state of the thread is to count itself
+			//so the number of threads doesn't change when switching between a completed task and a new one
+			numActiveThreads++;
+
 			//infinite loop waiting for work
 			for(;;)
 			{
@@ -51,6 +57,8 @@ void ThreadPool::ChangeThreadPoolSize(size_t new_max_num_threads)
 					//if no more work, wait until shutdown or more work
 					if(taskQueue.empty())
 					{
+						numActiveThreads--;
+
 						//wait until either shutting down or more work has been added
 						waitForTask.wait(lock,
 							[this] { return shutdownThreads || !taskQueue.empty(); });
@@ -58,19 +66,18 @@ void ThreadPool::ChangeThreadPoolSize(size_t new_max_num_threads)
 						//only can make it here if shutting down (otherwise taskQueue has something in it)
 						if(shutdownThreads)
 							return;
+
+						//got a task, resuming the thread
+						numActiveThreads++;
 					}					
 
 					//take ownership of the task so it can be destructed when complete
 					// (won't increment shared_ptr counter)
 					task = std::move(taskQueue.front());
 					taskQueue.pop();
-
-					//count the thread as active before releasing the lock
-					numActiveThreads++;
 				}
 
 				task();
-				numActiveThreads--;
 			}
 		}
 		);
@@ -78,7 +85,8 @@ void ThreadPool::ChangeThreadPoolSize(size_t new_max_num_threads)
 
 	//notify all just in case a new task was added as the threads were being created
 	// but unlock to allow threads to proceed
-	lock.unlock();
+	threads_lock.unlock();
+	queue_lock.unlock();
 	waitForTask.notify_all();
 }
 
diff --git a/src/Amalgam/ThreadPool.h b/src/Amalgam/ThreadPool.h
index eeec94ed..6d1da074 100644
--- a/src/Amalgam/ThreadPool.h
+++ b/src/Amalgam/ThreadPool.h
@@ -26,10 +26,19 @@ class ThreadPool
 		return numActiveThreads;
 	}
 
+	//reports how many worker threads the pool currently holds, reading the count under threadsMutex
+	inline size_t GetCurrentMaxNumThreads()
+	{
+		std::lock_guard<std::mutex> threads_guard(threadsMutex);
+		return threads.size();
+	}
+
 	//returns a vector of the thread ids for the thread pool
 	inline std::vector<std::thread::id> GetThreadIds()
 	{
 		std::vector<std::thread::id> thread_ids;
+
+		std::unique_lock<std::mutex> lock(threadsMutex);
 		thread_ids.reserve(threads.size() + 1);
 		thread_ids.push_back(mainThreadId);
 		for(std::thread &worker : threads)
@@ -147,7 +156,7 @@ class ThreadPool
 			//need to make sure there's at least one extra thread available to make sure that this batch of tasks can be run
 			// in case there are any interdependencies, in order to prevent deadlock
 			if(taskQueue.size() + numActiveThreads >= threads.size())
-				btel.MarkAsNoThreadsAvailable();			
+				btel.MarkAsNoThreadsAvailable();
 		}
 
 		return btel;
diff --git a/src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp b/src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp
index 104b91ab..e9253ee0 100644
--- a/src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp
+++ b/src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp
@@ -13,13 +13,13 @@ Concurrency::ReadWriteMutex EvaluableNodeManager::memoryModificationMutex;
 #endif
 
 const double EvaluableNodeManager::allocExpansionFactor = 1.5;
-const ExecutionCycleCountCompactDelta EvaluableNodeManager::minCycleCountBetweenGarbageCollects = 150000;
-
-EvaluableNodeManager::EvaluableNodeManager()
-{
-	firstUnusedNodeIndex = 0;
-	executionCyclesSinceLastGarbageCollection = 0;
-}
+#ifdef MULTITHREAD_SUPPORT
+const ExecutionCycleCountCompactDelta EvaluableNodeManager::minCycleCountBetweenGarbageCollectsPerThread = 150000;
+#else
+//make the next value constant if no threads
+const
+#endif
+ExecutionCycleCountCompactDelta EvaluableNodeManager::minCycleCountBetweenGarbageCollects = 150000;
 
 EvaluableNodeManager::~EvaluableNodeManager()
 {
@@ -163,34 +163,6 @@ EvaluableNode *EvaluableNodeManager::AllocListNodeWithOrderedChildNodes(Evaluabl
 	return retval;
 }
 
-bool EvaluableNodeManager::RecommendGarbageCollection()
-{
-	//makes sure to perform garbage collection between every opcode to find memory reference errors
-#ifdef PEDANTIC_GARBAGE_COLLECTION
-	return true;
-#endif
-
-#ifdef MULTITHREAD_SUPPORT
-	if(executionCyclesSinceLastGarbageCollection > minCycleCountBetweenGarbageCollects * static_cast<ExecutionCycleCount>(Concurrency::threadPool.GetNumActiveThreads()))
-#else
-	if(executionCyclesSinceLastGarbageCollection > minCycleCountBetweenGarbageCollects)
-#endif
-	{
-		auto cur_size = GetNumberOfUsedNodes();
-
-		size_t next_expansion_size = static_cast<size_t>(cur_size * allocExpansionFactor);
-		if(next_expansion_size < nodes.size())
-		{
-			executionCyclesSinceLastGarbageCollection = 0;
-			return false;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
 #ifdef MULTITHREAD_SUPPORT
 void EvaluableNodeManager::CollectGarbage(Concurrency::ReadLock *memory_modification_lock)
 #else
diff --git a/src/Amalgam/evaluablenode/EvaluableNodeManagement.h b/src/Amalgam/evaluablenode/EvaluableNodeManagement.h
index 58ec462f..02704937 100644
--- a/src/Amalgam/evaluablenode/EvaluableNodeManagement.h
+++ b/src/Amalgam/evaluablenode/EvaluableNodeManagement.h
@@ -124,7 +124,9 @@ class EvaluableNodeStackStateSaver
 class EvaluableNodeManager
 {
 public:
-	EvaluableNodeManager();
+	EvaluableNodeManager() :
+		executionCyclesSinceLastGarbageCollection(0), firstUnusedNodeIndex(0)
+	{	}
 
 	~EvaluableNodeManager();
 
@@ -329,7 +331,30 @@ class EvaluableNodeManager
 	}
 
 	//heuristic used to determine whether unused memory should be collected (e.g., by FreeAllNodesExcept*)
-	bool RecommendGarbageCollection();
+	//forced inline because it is called from inner loops
+	__forceinline bool RecommendGarbageCollection()
+	{
+		//pedantic builds collect between every opcode so that memory reference errors are found
+	#ifdef PEDANTIC_GARBAGE_COLLECTION
+		return true;
+	#endif
+
+		//too few execution cycles have elapsed since the last collection, so don't collect yet
+		if(executionCyclesSinceLastGarbageCollection <= minCycleCountBetweenGarbageCollects)
+			return false;
+
+		//scale the number of used nodes by the allocation expansion factor
+		auto num_used_nodes = GetNumberOfUsedNodes();
+		size_t projected_size = static_cast<size_t>(num_used_nodes * allocExpansionFactor);
+
+		//the scaled size is not smaller than the number of nodes already held, so recommend collecting
+		if(projected_size >= nodes.size())
+			return true;
+
+		//still room to spare: restart the cycle count and hold off on collecting
+		executionCyclesSinceLastGarbageCollection = 0;
+		return false;
+	}
 
 	//moves garbage collection to be more likely to be triggered next time CollectGarbage is called
 	__forceinline void AdvanceGarbageCollectionTrigger()
@@ -672,8 +697,21 @@ class EvaluableNodeManager
 
 	static void ValidateEvaluableNodeTreeMemoryIntegrityRecurse(EvaluableNode *en, EvaluableNode::ReferenceSetType &checked);
 
-#ifdef MULTITHREAD_SUPPORT	
+#ifdef MULTITHREAD_SUPPORT
 public:
+
+	//rescales the garbage collection threshold by the number of tasks, bounded by the thread pool's size and activity
+	// NOTE(review): the threshold is a shared static written here without synchronization -- confirm that is acceptable
+	static inline void UpdateMinCycleCountBetweenGarbageCollectsBasedOnThreads(size_t num_tasks)
+	{
+		//can't go above the max number of threads
+		num_tasks = std::min(num_tasks, Concurrency::threadPool.GetCurrentMaxNumThreads());
+		//don't go below the number of threads used by other things, nor below one so the threshold never becomes zero
+		num_tasks = std::max<size_t>(1, std::max(num_tasks, Concurrency::threadPool.GetNumActiveThreads()));
+		minCycleCountBetweenGarbageCollects = minCycleCountBetweenGarbageCollectsPerThread
+			* static_cast<ExecutionCycleCountCompactDelta>(num_tasks);
+	}
+
 	//mutex to manage attributes of manager, including operations such as
 	// memory allocation, reference management, etc.
 	Concurrency::ReadWriteMutex managerAttributesMutex;
@@ -707,6 +745,13 @@ class EvaluableNodeManager
 	//extra space to allocate when allocating
 	static const double allocExpansionFactor;
 
-	//minimum number of cycles between collects as to not spend too much time garbage collecting
-	static const ExecutionCycleCountCompactDelta minCycleCountBetweenGarbageCollects;
+#ifdef MULTITHREAD_SUPPORT
+	//minimum number of cycles between collects per thread
+	static const ExecutionCycleCountCompactDelta minCycleCountBetweenGarbageCollectsPerThread;
+#else
+	//make the next value constant if no threads
+	const
+#endif
+	//current number of cycles between collects based on number of threads
+	static ExecutionCycleCountCompactDelta minCycleCountBetweenGarbageCollects;
 };
diff --git a/src/Amalgam/interpreter/Interpreter.cpp b/src/Amalgam/interpreter/Interpreter.cpp
index c02a2a8c..4f5d7263 100644
--- a/src/Amalgam/interpreter/Interpreter.cpp
+++ b/src/Amalgam/interpreter/Interpreter.cpp
@@ -775,21 +775,21 @@ bool Interpreter::InterpretEvaluableNodesConcurrently(EvaluableNode *parent_node
 	if(!parent_node->GetConcurrency())
 		return false;
 	
-	size_t num_elements = nodes.size();
-	if(num_elements < 2)
+	size_t num_tasks = nodes.size();
+	if(num_tasks < 2)
 		return false;
 
 	auto enqueue_task_lock = Concurrency::threadPool.BeginEnqueueBatchTask();
 	if(!enqueue_task_lock.AreThreadsAvailable())
 		return false;
 
-	ConcurrencyManager concurrency_manager(this, num_elements);
+	ConcurrencyManager concurrency_manager(this, num_tasks);
 
 	//kick off interpreters
-	for(size_t element_index = 0; element_index < num_elements; element_index++)
+	for(size_t task_index = 0; task_index < num_tasks; task_index++)
 	{
-		auto &interpreter = *concurrency_manager.interpreters[element_index];
-		EvaluableNode *node_to_execute = nodes[element_index];
+		auto &interpreter = *concurrency_manager.interpreters[task_index];
+		EvaluableNode *node_to_execute = nodes[task_index];
 
 		concurrency_manager.resultFutures.emplace_back(
 			Concurrency::threadPool.EnqueueBatchTask(
diff --git a/src/Amalgam/interpreter/Interpreter.h b/src/Amalgam/interpreter/Interpreter.h
index 5347f834..30dc3b1f 100644
--- a/src/Amalgam/interpreter/Interpreter.h
+++ b/src/Amalgam/interpreter/Interpreter.h
@@ -394,18 +394,18 @@ class Interpreter
 	public:
 
 		//constructs the concurrency manager.  Assumes parent_interpreter is NOT null
-		ConcurrencyManager(Interpreter *parent_interpreter, size_t num_elements)
+		ConcurrencyManager(Interpreter *parent_interpreter, size_t num_tasks)
 		{
 			parentInterpreter = parent_interpreter;
-			numElements = num_elements;
+			numTasks = num_tasks;
 
 			//set up data
-			interpreters.reserve(numElements);
-			resultFutures.reserve(numElements);
+			interpreters.reserve(numTasks);
+			resultFutures.reserve(numTasks);
 
 			size_t max_execution_steps_per_element = 0;
 			if(parentInterpreter->maxNumExecutionSteps > 0)
-				max_execution_steps_per_element = (parentInterpreter->maxNumExecutionSteps - parentInterpreter->GetNumStepsExecuted()) / numElements;
+				max_execution_steps_per_element = (parentInterpreter->maxNumExecutionSteps - parentInterpreter->GetNumStepsExecuted()) / numTasks;
 
 			//since each thread has a copy of the constructionStackNodes, it's possible that more than one of the threads
 			//obtains previous_results, so they must all be marked as not unique
@@ -413,7 +413,7 @@ class Interpreter
 
 			//set up all the interpreters
 			// do this as its own loop to make sure that the vector memory isn't reallocated once the threads have kicked off
-			for(size_t element_index = 0; element_index < numElements; element_index++)
+			for(size_t element_index = 0; element_index < numTasks; element_index++)
 			{
 				//create interpreter
 				interpreters.emplace_back(std::make_unique<Interpreter>(parentInterpreter->evaluableNodeManager, max_execution_steps_per_element, parentInterpreter->maxNumExecutionNodes,
@@ -421,6 +421,8 @@ class Interpreter
 					parentInterpreter->writeListeners, parentInterpreter->printListener, parentInterpreter->curEntity));
 			}
 
+			EvaluableNodeManager::UpdateMinCycleCountBetweenGarbageCollectsBasedOnThreads(num_tasks);
+
 			//begins concurrency over all interpreters
 			parentInterpreter->memoryModificationLock.unlock();
 		}
@@ -482,6 +484,9 @@ class Interpreter
 
 			Concurrency::threadPool.CountCurrentThreadAsResumed();
 
+			//merged back to one task (this method will attempt to account for other concurrency)
+			EvaluableNodeManager::UpdateMinCycleCountBetweenGarbageCollectsBasedOnThreads(1);
+
 			parentInterpreter->memoryModificationLock.lock();
 		}
 
@@ -491,11 +496,11 @@ class Interpreter
 		inline std::vector<EvaluableNodeReference> GetResultsAndFreeReferences()
 		{
 			std::vector<EvaluableNodeReference> results;
-			results.resize(numElements);
+			results.resize(numTasks);
 
 			//fill in results from result_futures and free references
 			// note that std::future becomes invalid once get is called
-			for(size_t i = 0; i < numElements; i++)
+			for(size_t i = 0; i < numTasks; i++)
 				results[i] = resultFutures[i].get();
 
 			parentInterpreter->evaluableNodeManager->FreeNodeReferences(results);
@@ -514,10 +519,10 @@ class Interpreter
 			return &callStackWriteMutex;
 		}
 
-		//interpreters run concurrently, the size of numElements
+		//interpreters run concurrently, the size of numTasks
 		std::vector<std::unique_ptr<Interpreter>> interpreters;
 
-		//where results are placed, the size of numElements
+		//where results are placed, the size of numTasks
 		std::vector<std::future<EvaluableNodeReference>> resultFutures;
 
 		//mutex to allow only one thread to write to a call stack symbol at once
@@ -528,7 +533,7 @@ class Interpreter
 		Interpreter *parentInterpreter;
 
 		//the number of elements being processed
-		size_t numElements;
+		size_t numTasks;
 	};
 
 	//computes the nodes concurrently and stores the interpreted values into interpreted_nodes