Skip to content

Commit

Permalink
[release/9.0-staging] DATAS BGC thread synchronization fix
Browse files Browse the repository at this point in the history
Backport of #109804 to release/9.0-staging

/cc @mangod9 @mrsharm

Customer Impact
 Customer reported
 Found internally
Original issue: #109804. Customers discovered a hang because one of the BGC threads had disappeared which shouldn't have happened - this is due to a BGC thread that was not needed during a previous BGC (because DATAS only needed a subset of BGC threads for that BGC to run) ran at a time when settings.concurrent was FALSE so it exited.

Regression
 Yes
 No
Testing
I have added stress code in GC (under STRESS_DYNAMIC_HEAP_COUNT) to make the repro quite easy.

Risk
Low. This has gone through extensive testing on both Windows and Linux.
  • Loading branch information
Maoni0 authored Nov 27, 2024
1 parent d955059 commit 6124867
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 14 deletions.
173 changes: 159 additions & 14 deletions src/coreclr/gc/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2890,10 +2890,14 @@ bool gc_heap::trigger_initial_gen2_p = false;

#ifdef BACKGROUND_GC
bool gc_heap::trigger_bgc_for_rethreading_p = false;
int gc_heap::total_bgc_threads = 0;
int gc_heap::last_bgc_n_heaps = 0;
int gc_heap::last_total_bgc_threads = 0;
#endif //BACKGROUND_GC

#ifdef STRESS_DYNAMIC_HEAP_COUNT
int gc_heap::heaps_in_this_gc = 0;
int gc_heap::bgc_to_ngc2_ratio = 0;
#endif //STRESS_DYNAMIC_HEAP_COUNT
#endif // DYNAMIC_HEAP_COUNT

Expand Down Expand Up @@ -14190,6 +14194,11 @@ HRESULT gc_heap::initialize_gc (size_t soh_segment_size,

if ((dynamic_adaptation_mode == dynamic_adaptation_to_application_sizes) && (conserve_mem_setting == 0))
conserve_mem_setting = 5;

#ifdef STRESS_DYNAMIC_HEAP_COUNT
bgc_to_ngc2_ratio = (int)GCConfig::GetGCDBGCRatio();
dprintf (1, ("bgc_to_ngc2_ratio is %d", bgc_to_ngc2_ratio));
#endif
#endif //DYNAMIC_HEAP_COUNT

if (conserve_mem_setting < 0)
Expand Down Expand Up @@ -21079,6 +21088,18 @@ int gc_heap::joined_generation_to_condemn (BOOL should_evaluate_elevation,
if (!((n == max_generation) && *blocking_collection_p))
{
n = max_generation;

#ifdef STRESS_DYNAMIC_HEAP_COUNT
if (bgc_to_ngc2_ratio)
{
int r = (int)gc_rand::get_rand ((bgc_to_ngc2_ratio + 1) * 10);
dprintf (6666, ("%d - making this full GC %s", r, ((r < 10) ? "NGC2" : "BGC")));
if (r < 10)
{
*blocking_collection_p = TRUE;
}
}
#endif //STRESS_DYNAMIC_HEAP_COUNT
}
}
}
Expand Down Expand Up @@ -24340,12 +24361,37 @@ void gc_heap::garbage_collect (int n)
size_t saved_bgc_th_count_creation_failed = bgc_th_count_creation_failed;
#endif //DYNAMIC_HEAP_COUNT

// This is the count of threads that GCToEEInterface::CreateThread reported successful for.
int total_bgc_threads_running = 0;
for (int i = 0; i < n_heaps; i++)
{
prepare_bgc_thread (g_heaps[i]);
gc_heap* hp = g_heaps[i];
if (prepare_bgc_thread (hp))
{
assert (hp->bgc_thread_running);
if (!hp->bgc_thread_running)
{
dprintf (6666, ("h%d prepare succeeded but running is still false!", i));
GCToOSInterface::DebugBreak();
}
total_bgc_threads_running++;
}
else
{
break;
}
}

#ifdef DYNAMIC_HEAP_COUNT
// Even if we don't do a BGC, we need to record how many threads were successfully created because those will
// be running.
total_bgc_threads = max (total_bgc_threads, total_bgc_threads_running);

if (total_bgc_threads_running != n_heaps)
{
dprintf (6666, ("wanted to have %d BGC threads but only have %d", n_heaps, total_bgc_threads_running));
}

add_to_bgc_th_creation_history (current_gc_index,
(bgc_th_count_created - saved_bgc_th_count_created),
(bgc_th_count_created_th_existed - saved_bgc_th_count_created_th_existed),
Expand Down Expand Up @@ -24377,7 +24423,15 @@ void gc_heap::garbage_collect (int n)
for (int i = 0; i < n_heaps; i++)
{
gc_heap* hp = g_heaps[i];
if (!(hp->bgc_thread) || !hp->commit_mark_array_bgc_init())

if (!(hp->bgc_thread_running))
{
assert (!(hp->bgc_thread));
}

// In theory we could be in a situation where bgc_thread_running is false but bgc_thread is non NULL. We don't
// support this scenario so don't do a BGC.
if (!(hp->bgc_thread_running && hp->bgc_thread && hp->commit_mark_array_bgc_init()))
{
do_concurrent_p = FALSE;
break;
Expand All @@ -24397,8 +24451,37 @@ void gc_heap::garbage_collect (int n)
}
#endif //MULTIPLE_HEAPS

#ifdef DYNAMIC_HEAP_COUNT
dprintf (6666, ("last BGC saw %d heaps and %d total threads, currently %d heaps and %d total threads, %s BGC",
last_bgc_n_heaps, last_total_bgc_threads, n_heaps, total_bgc_threads, (do_concurrent_p ? "doing" : "not doing")));
#endif //DYNAMIC_HEAP_COUNT

if (do_concurrent_p)
{
#ifdef DYNAMIC_HEAP_COUNT
int diff = n_heaps - last_bgc_n_heaps;
if (diff > 0)
{
int saved_idle_bgc_thread_count = dynamic_heap_count_data.idle_bgc_thread_count;
int max_idle_event_count = min (n_heaps, last_total_bgc_threads);
int idle_events_to_set = max_idle_event_count - last_bgc_n_heaps;
if (idle_events_to_set > 0)
{
Interlocked::ExchangeAdd (&dynamic_heap_count_data.idle_bgc_thread_count, -idle_events_to_set);
dprintf (6666, ("%d BGC threads exist, setting %d idle events for h%d-h%d, total idle %d -> %d",
total_bgc_threads, idle_events_to_set, last_bgc_n_heaps, (last_bgc_n_heaps + idle_events_to_set - 1),
saved_idle_bgc_thread_count, VolatileLoadWithoutBarrier (&dynamic_heap_count_data.idle_bgc_thread_count)));
for (int heap_idx = last_bgc_n_heaps; heap_idx < max_idle_event_count; heap_idx++)
{
g_heaps[heap_idx]->bgc_idle_thread_event.Set();
}
}
}

last_bgc_n_heaps = n_heaps;
last_total_bgc_threads = total_bgc_threads;
#endif //DYNAMIC_HEAP_COUNT

#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
SoftwareWriteWatch::EnableForGCHeap();
#endif //FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
Expand Down Expand Up @@ -25926,9 +26009,6 @@ void gc_heap::check_heap_count ()
for (int heap_idx = n_heaps; heap_idx < new_n_heaps; heap_idx++)
{
g_heaps[heap_idx]->gc_idle_thread_event.Set();
#ifdef BACKGROUND_GC
g_heaps[heap_idx]->bgc_idle_thread_event.Set();
#endif //BACKGROUND_GC
}
}

Expand Down Expand Up @@ -37645,6 +37725,19 @@ void gc_heap::gc_thread_stub (void* arg)
void gc_heap::bgc_thread_stub (void* arg)
{
gc_heap* heap = (gc_heap*)arg;

#ifdef STRESS_DYNAMIC_HEAP_COUNT
// We should only do this every so often; otherwise we'll never be able to do a BGC
int r = (int)gc_rand::get_rand (30);
bool wait_p = (r < 10);

if (wait_p)
{
GCToOSInterface::Sleep (100);
}
dprintf (6666, ("h%d %s", heap->heap_number, (wait_p ? "waited" : "did not wait")));
#endif

heap->bgc_thread = GCToEEInterface::GetThread();
assert(heap->bgc_thread != nullptr);
heap->bgc_thread_function();
Expand Down Expand Up @@ -39437,6 +39530,8 @@ void gc_heap::add_to_bgc_th_creation_history (size_t gc_index, size_t count_crea
}
#endif //DYNAMIC_HEAP_COUNT

// If this returns TRUE, we are saying we expect that thread to be there. However, when that thread is available to work is indeterministic.
// But when we actually start a BGC, naturally we'll need to wait till it gets to the point it can work.
BOOL gc_heap::prepare_bgc_thread(gc_heap* gh)
{
BOOL success = FALSE;
Expand All @@ -39448,7 +39543,19 @@ BOOL gc_heap::prepare_bgc_thread(gc_heap* gh)
dprintf (2, ("GC thread not running"));
if (gh->bgc_thread == 0)
{
#ifdef STRESS_DYNAMIC_HEAP_COUNT
// to stress, we just don't actually try to create the thread to simulate a failure
int r = (int)gc_rand::get_rand (100);
bool try_to_create_p = (r > 10);
BOOL thread_created_p = (try_to_create_p ? create_bgc_thread (gh) : FALSE);
if (!thread_created_p)
{
dprintf (6666, ("h%d we failed to create the thread, %s", gh->heap_number, (try_to_create_p ? "tried" : "didn't try")));
}
if (thread_created_p)
#else //STRESS_DYNAMIC_HEAP_COUNT
if (create_bgc_thread(gh))
#endif //STRESS_DYNAMIC_HEAP_COUNT
{
success = TRUE;
thread_created = TRUE;
Expand All @@ -39466,8 +39573,11 @@ BOOL gc_heap::prepare_bgc_thread(gc_heap* gh)
else
{
#ifdef DYNAMIC_HEAP_COUNT
// This would be a very unusual scenario where GCToEEInterface::CreateThread told us it failed yet the thread was created.
bgc_th_count_created_th_existed++;
dprintf (6666, ("h%d we cannot have a thread that runs yet CreateThread reported it failed to create it", gh->heap_number));
#endif //DYNAMIC_HEAP_COUNT
assert (!"GCToEEInterface::CreateThread returned FALSE yet the thread was created!");
}
}
else
Expand Down Expand Up @@ -39665,7 +39775,7 @@ void gc_heap::bgc_thread_function()
while (1)
{
// Wait for work to do...
dprintf (3, ("bgc thread: waiting..."));
dprintf (6666, ("h%d bgc thread: waiting...", heap_number));

cooperative_mode = enable_preemptive ();
//current_thread->m_fPreemptiveGCDisabled = 0;
Expand Down Expand Up @@ -39714,36 +39824,71 @@ void gc_heap::bgc_thread_function()
continue;
}
}

#ifdef STRESS_DYNAMIC_HEAP_COUNT
if (n_heaps <= heap_number)
{
uint32_t delay_ms = (uint32_t)gc_rand::get_rand (200);
GCToOSInterface::Sleep (delay_ms);
}
#endif //STRESS_DYNAMIC_HEAP_COUNT

// if we signal the thread with no concurrent work to do -> exit
if (!settings.concurrent)
{
dprintf (3, ("no concurrent GC needed, exiting"));
dprintf (6666, ("h%d no concurrent GC needed, exiting", heap_number));

#ifdef STRESS_DYNAMIC_HEAP_COUNT
flush_gc_log (true);
GCToOSInterface::DebugBreak();
#endif
break;
}
gc_background_running = TRUE;
dprintf (2, (ThreadStressLog::gcStartBgcThread(), heap_number,
generation_free_list_space (generation_of (max_generation)),
generation_free_obj_space (generation_of (max_generation)),
dd_fragmentation (dynamic_data_of (max_generation))));

#ifdef DYNAMIC_HEAP_COUNT
if (n_heaps <= heap_number)
{
Interlocked::Increment (&dynamic_heap_count_data.idle_bgc_thread_count);
add_to_bgc_hc_history (hc_record_bgc_inactive);

// this is the case where we have more background GC threads than heaps
// - wait until we're told to continue...
dprintf (9999, ("BGC thread %d idle (%d heaps) (gc%Id)", heap_number, n_heaps, VolatileLoadWithoutBarrier (&settings.gc_index)));
dprintf (6666, ("BGC%Id h%d going idle (%d heaps), idle count is now %d",
VolatileLoadWithoutBarrier (&settings.gc_index), heap_number, n_heaps, VolatileLoadWithoutBarrier (&dynamic_heap_count_data.idle_bgc_thread_count)));
bgc_idle_thread_event.Wait(INFINITE, FALSE);
dprintf (9999, ("BGC thread %d waking from idle (%d heaps) (gc%Id)", heap_number, n_heaps, VolatileLoadWithoutBarrier (&settings.gc_index)));
dprintf (6666, ("BGC%Id h%d woke from idle (%d heaps), idle count is now %d",
VolatileLoadWithoutBarrier (&settings.gc_index), heap_number, n_heaps, VolatileLoadWithoutBarrier (&dynamic_heap_count_data.idle_bgc_thread_count)));
continue;
}
else
{
if (heap_number == 0)
{
const int spin_count = 1024;
int idle_bgc_thread_count = total_bgc_threads - n_heaps;
dprintf (6666, ("n_heaps %d, total %d bgc threads, bgc idle should be %d and is %d",
n_heaps, total_bgc_threads, idle_bgc_thread_count, VolatileLoadWithoutBarrier (&dynamic_heap_count_data.idle_bgc_thread_count)));
if (idle_bgc_thread_count != dynamic_heap_count_data.idle_bgc_thread_count)
{
dprintf (6666, ("current idle is %d, trying to get to %d",
VolatileLoadWithoutBarrier (&dynamic_heap_count_data.idle_bgc_thread_count), idle_bgc_thread_count));
spin_and_wait (spin_count, (idle_bgc_thread_count == dynamic_heap_count_data.idle_bgc_thread_count));
}
}

add_to_bgc_hc_history (hc_record_bgc_active);
}
#endif //DYNAMIC_HEAP_COUNT

if (heap_number == 0)
{
gc_background_running = TRUE;
dprintf (6666, (ThreadStressLog::gcStartBgcThread(), heap_number,
generation_free_list_space (generation_of (max_generation)),
generation_free_obj_space (generation_of (max_generation)),
dd_fragmentation (dynamic_data_of (max_generation))));
}

gc1();

#ifndef DOUBLY_LINKED_FL
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/gc/gcconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class GCConfigStringHolder
INT_CONFIG (GCSpinCountUnit, "GCSpinCountUnit", NULL, 0, "Specifies the spin count unit used by the GC.") \
INT_CONFIG (GCDynamicAdaptationMode, "GCDynamicAdaptationMode", "System.GC.DynamicAdaptationMode", 1, "Enable the GC to dynamically adapt to application sizes.") \
INT_CONFIG (GCDTargetTCP, "GCDTargetTCP", "System.GC.DTargetTCP", 0, "Specifies the target tcp for DATAS") \
INT_CONFIG (GCDBGCRatio, " GCDBGCRatio", NULL, 0, "Specifies the ratio of BGC to NGC2 for HC change") \
BOOL_CONFIG (GCLogBGCThreadId, "GCLogBGCThreadId", NULL, false, "Specifies if BGC ThreadId should be logged") \
BOOL_CONFIG (GCCacheSizeFromSysConf, "GCCacheSizeFromSysConf", NULL, false, "Specifies using sysconf to retrieve the last level cache size for Unix.")

Expand Down
17 changes: 17 additions & 0 deletions src/coreclr/gc/gcpriv.h
Original file line number Diff line number Diff line change
Expand Up @@ -5175,6 +5175,9 @@ class gc_heap
int last_n_heaps;
// don't start a GC till we see (n_max_heaps - new_n_heaps) number of threads idling
VOLATILE(int32_t) idle_thread_count;
#ifdef BACKGROUND_GC
VOLATILE(int32_t) idle_bgc_thread_count;
#endif
bool init_only_p;

bool should_change_heap_count;
Expand Down Expand Up @@ -5202,6 +5205,17 @@ class gc_heap
// This is set when change_heap_count wants the next GC to be a BGC for rethreading gen2 FL
// and reset during that BGC.
PER_HEAP_ISOLATED_FIELD_MAINTAINED bool trigger_bgc_for_rethreading_p;
// BGC threads are created on demand but we don't destroy the ones we created. This
// is to track how many we've created. They may or may not be active depending on
// if they are needed.
PER_HEAP_ISOLATED_FIELD_MAINTAINED int total_bgc_threads;

// HC last BGC observed.
PER_HEAP_ISOLATED_FIELD_MAINTAINED int last_bgc_n_heaps;
// Number of total BGC threads last BGC observed. This tells us how many new BGC threads have
// been created since. Note that just because a BGC thread is created doesn't mean it's used.
// We can fail at committing mark array and not proceed with the BGC.
PER_HEAP_ISOLATED_FIELD_MAINTAINED int last_total_bgc_threads;
#endif //BACKGROUND_GC
#endif //DYNAMIC_HEAP_COUNT

Expand Down Expand Up @@ -5352,6 +5366,9 @@ class gc_heap

#ifdef DYNAMIC_HEAP_COUNT
PER_HEAP_ISOLATED_FIELD_INIT_ONLY int dynamic_adaptation_mode;
#ifdef STRESS_DYNAMIC_HEAP_COUNT
PER_HEAP_ISOLATED_FIELD_INIT_ONLY int bgc_to_ngc2_ratio;
#endif //STRESS_DYNAMIC_HEAP_COUNT
#endif //DYNAMIC_HEAP_COUNT

/********************************************/
Expand Down

0 comments on commit 6124867

Please sign in to comment.