[Gc] PARALLEL_MARK for Cygwin and for Win32 now works

Ivan Maidanski ivmai at mail.ru
Wed Oct 22 16:12:37 PDT 2008


Hi!

I've ported the parallel marking algorithm to Win threads and Cygwin pthreads. It works with/without USE_MMAP (Cygwin only), USE_MUNMAP (Win32 only), THREAD_LOCAL_ALLOC. It is not compatible with GC_use_DllMain(). Tested with cygwin, mingw, VC++ (the latter reqires -DAO_ASSUME_WINDOWS98). Should work on Win64 but not tested yet. Lock spinning is implemented for Win threads case (turned on by MARKLOCK_USES_SPIN). Locking statistic is available if compiled with LOCK_STATS. "GC_MARKERS" environment var is recognized.

First usage of PARALLEL_MARK shows decrease of "world-stop" delay time up to 4 times in average (in non-incremental mode) on my Core2Duo.

test.c sometimes aborts with "Unexpected heap growth" - it seems to be ok (test.c should be fixed).

In cygwin GC sometimes dead-locks (in pthread_cond_wait()).

I have more problems with and questions regarding PARALLEL_MARK... But for now, that's all.

Bye.

-------------- next part --------------
diff -ru bdwgc/include/private/gcconfig.h updated/bdwgc/include/private/gcconfig.h
--- bdwgc/include/private/gcconfig.h	2008-10-19 23:09:32.000000000 +0400
+++ updated/bdwgc/include/private/gcconfig.h	2008-10-21 15:00:10.000000000 +0400
@@ -2110,7 +2110,8 @@
 #   undef MPROTECT_VDB  /* Can't deal with address space holes. */
 # endif
 
-# ifdef PARALLEL_MARK
+# if defined(PARALLEL_MARK)
+    /* FIXME: Should we undef it even in case of GWW_VDB? */
 #   undef MPROTECT_VDB  /* For now.	*/
 # endif
 
diff -ru bdwgc/win32_threads.c updated/bdwgc/win32_threads.c
--- bdwgc/win32_threads.c	2008-10-16 19:24:41.000000000 +0400
+++ updated/bdwgc/win32_threads.c	2008-10-22 22:24:36.000000000 +0400
@@ -1,3 +1,16 @@
+/* 
+ * Copyright here...
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
+ * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program
+ * for any purpose,  provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is granted,
+ * provided the above notices are retained, and a notice that the code was
+ * modified is included with the above copyright notice.
+ */
+
 #include "private/gc_priv.h"
 
 #if defined(GC_WIN32_THREADS)
@@ -254,8 +267,10 @@
 /* And now the version used if GC_win32_dll_threads is not set.	*/
 /* This is a chained hash table, with much of the code borrowed	*/
 /* From the Posix implementation.				*/
+#ifndef THREAD_TABLE_SZ
 # define THREAD_TABLE_SZ 256	/* Must be power of 2	*/
-  GC_thread GC_threads[THREAD_TABLE_SZ];
+#endif
+  STATIC GC_thread GC_threads[THREAD_TABLE_SZ];
   
 
 /* Add a thread to GC_threads.  We assume it wasn't already there.	*/
@@ -541,7 +556,7 @@
     GC_thread t = GC_lookup_thread_inner(id);
 
     if (0 == t) {
-      WARN("Removing nonexistent thread %ld\n", (GC_word)id);
+      WARN("Removing nonexistent thread %ld\n", (long)id);
     } else {
       GC_delete_gc_thread(t);
     }
@@ -731,10 +746,18 @@
 {
   DWORD thread_id = GetCurrentThreadId();
   int i;
+  int my_max;
 
   if (!GC_thr_initialized) ABORT("GC_stop_world() called before GC_thr_init()");
   GC_ASSERT(I_HOLD_LOCK());
 
+  /* This code is the same as in pthread_stop_world.c */
+# ifdef PARALLEL_MARK
+    GC_acquire_mark_lock();
+    GC_ASSERT(GC_fl_builder_count == 0);
+    /* We should have previously waited for it to become zero. */
+# endif /* PARALLEL_MARK */
+
   GC_please_stop = TRUE;
 # ifndef CYGWIN32
     EnterCriticalSection(&GC_write_cs);
@@ -745,7 +768,8 @@
     /* restart.								*/
     /* This is not ideal, but hopefully correct.			*/
     GC_attached_thread = FALSE;
-    for (i = 0; i <= GC_get_max_thread_index(); i++) {
+    my_max = (int)GC_get_max_thread_index();
+    for (i = 0; i <= my_max; i++) {
       GC_vthread t = dll_thread_table + i;
       if (t -> stack_base != 0
 	  && t -> id != thread_id) {
@@ -769,6 +793,9 @@
 # ifndef CYGWIN32
     LeaveCriticalSection(&GC_write_cs);
 # endif    
+# ifdef PARALLEL_MARK
+    GC_release_mark_lock();
+# endif
 }
 
 void GC_start_world(void)
@@ -1029,6 +1056,9 @@
 	  }
         }
       }
+#     ifdef PARALLEL_MARK
+	/* FIXME: Should we scan marker threads here too? */
+#     endif
     }
     *hi = current_min;
     if (current_min == ADDR_LIMIT) {
@@ -1066,6 +1096,380 @@
     if (*lo < start) *lo = start;
 }
 
+#ifdef PARALLEL_MARK
+
+# ifndef MAX_MARKERS
+#   define MAX_MARKERS 16
+# endif
+
+  /* GC_mark_thread() is the same as in pthread_support.c	*/
+  /* except for unused marker_[b]sp.				*/
+#ifdef GC_PTHREADS
+  STATIC void * GC_mark_thread(void * id)
+#else
+  STATIC unsigned __stdcall GC_mark_thread(void * id)
+#endif
+{
+  word my_mark_no = 0;
+
+  if ((word)id == (word)-1) return 0; /* to make compiler happy */
+
+  for (;; ++my_mark_no) {
+    if (my_mark_no - GC_mark_no > (word)2) {
+	/* resynchronize if we get far off, e.g. because GC_mark_no	*/
+	/* wrapped.							*/
+	my_mark_no = GC_mark_no;
+    }
+#   ifdef DEBUG_THREADS
+	GC_printf("Starting mark helper for mark number %lu\n",
+		(unsigned long)my_mark_no);
+#   endif
+    GC_help_marker(my_mark_no);
+  }
+}
+
+extern long GC_markers;		/* Number of mark threads we would	*/
+				/* like to have.  Includes the 		*/
+				/* initiating thread.			*/
+
+#ifdef GC_ASSERTIONS
+  unsigned long GC_mark_lock_holder = NO_THREAD;
+#endif
+
+/* GC_mark_threads[] is unused here unlike that in pthread_support.c */
+
+#ifdef GC_PTHREADS
+
+/* start_mark_threads() is the same as in pthread_support.c except for:	*/
+/* - GC_markers value is adjusted already;				*/
+/* - thread stack is assumed to be large enough; and			*/
+/* - statistics about the number of marker threads is already printed.	*/
+
+STATIC void start_mark_threads(void)
+{
+    unsigned i;
+    pthread_attr_t attr;
+    pthread_t new_thread;
+
+    if (0 != pthread_attr_init(&attr)) ABORT("pthread_attr_init failed");
+	
+    if (0 != pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
+	ABORT("pthread_attr_setdetachstate failed");
+
+    for (i = 0; i < GC_markers - 1; ++i) {
+      if (0 != pthread_create(&new_thread, &attr,
+			      GC_mark_thread, (void *)(word)i)) {
+	WARN("Marker thread creation failed, errno = %d.\n", errno);
+      }
+    }
+    pthread_attr_destroy(&attr);
+}
+
+STATIC pthread_mutex_t mark_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+STATIC pthread_cond_t builder_cv = PTHREAD_COND_INITIALIZER;
+
+/* GC_acquire/release_mark_lock(), GC_wait_builder/marker(),		*/
+/* GC_wait_for_reclaim(), GC_notify_all_builder/marker() are the same	*/
+/* as in pthread_support.c except that GC_generic_lock() is not used.	*/
+
+#ifdef LOCK_STATS
+  unsigned long GC_block_count = 0;
+#endif
+
+void GC_acquire_mark_lock(void)
+{
+    if (pthread_mutex_lock(&mark_mutex) != 0) {
+	ABORT("pthread_mutex_lock failed");
+    }
+#   ifdef LOCK_STATS
+	++GC_block_count;
+#   endif
+    /* GC_generic_lock(&mark_mutex); */
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NUMERIC_THREAD_ID(pthread_self());
+#   endif
+}
+
+void GC_release_mark_lock(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == NUMERIC_THREAD_ID(pthread_self()));
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NO_THREAD;
+#   endif
+    if (pthread_mutex_unlock(&mark_mutex) != 0) {
+	ABORT("pthread_mutex_unlock failed");
+    }
+}
+
+/* Collector must wait for a freelist builders for 2 reasons:		*/
+/* 1) Mark bits may still be getting examined without lock.		*/
+/* 2) Partial free lists referenced only by locals may not be scanned 	*/
+/*    correctly, e.g. if they contain "pointer-free" objects, since the	*/
+/*    free-list link may be ignored.					*/
+/* STATIC */ void GC_wait_builder(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == NUMERIC_THREAD_ID(pthread_self()));
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NO_THREAD;
+#   endif
+    if (pthread_cond_wait(&builder_cv, &mark_mutex) != 0) {
+	ABORT("pthread_cond_wait failed");
+    }
+    GC_ASSERT(GC_mark_lock_holder == NO_THREAD);
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NUMERIC_THREAD_ID(pthread_self());
+#   endif
+}
+
+void GC_wait_for_reclaim(void)
+{
+    GC_acquire_mark_lock();
+    while (GC_fl_builder_count > 0) {
+	GC_wait_builder();
+    }
+    GC_release_mark_lock();
+}
+
+void GC_notify_all_builder(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == NUMERIC_THREAD_ID(pthread_self()));
+    if (pthread_cond_broadcast(&builder_cv) != 0) {
+	ABORT("pthread_cond_broadcast failed");
+    }
+}
+
+STATIC pthread_cond_t mark_cv = PTHREAD_COND_INITIALIZER;
+
+void GC_wait_marker(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == NUMERIC_THREAD_ID(pthread_self()));
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NO_THREAD;
+#   endif
+    if (pthread_cond_wait(&mark_cv, &mark_mutex) != 0) {
+	ABORT("pthread_cond_wait failed");
+    }
+    GC_ASSERT(GC_mark_lock_holder == NO_THREAD);
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NUMERIC_THREAD_ID(pthread_self());
+#   endif
+}
+
+void GC_notify_all_marker(void)
+{
+    if (pthread_cond_broadcast(&mark_cv) != 0) {
+	ABORT("pthread_cond_broadcast failed");
+    }
+}
+
+#else /* ! GC_PTHREADS */
+
+STATIC void start_mark_threads(void)
+{
+      int i;
+      GC_uintptr_t handle;
+      unsigned thread_id;
+
+      for (i = 0; i < GC_markers - 1; ++i) {
+	handle = _beginthreadex(NULL /* security_attr */, 0 /* stack_size */,
+			GC_mark_thread, (void *)(word)i, 0 /* flags */,
+			&thread_id);
+	if (!handle || handle == (GC_uintptr_t)-1L)
+	  WARN("Marker thread creation failed\n", 0);
+	else { /* We may detach the thread (if handle is of HANDLE type) */
+	  /* CloseHandle((HANDLE)handle); */
+	}
+      }
+}
+
+STATIC HANDLE mark_mutex_event = (HANDLE)0; /* Event with auto-reset. */
+volatile AO_t GC_mark_mutex_waitcnt = 0;	/* Number of waiters - 1; */
+					 	/* 0 - unlocked. */
+
+STATIC HANDLE builder_cv = (HANDLE)0; /* Event with manual reset */
+
+/* mark_mutex_event, builder_cv, mark_cv are initialized in GC_thr_init(). */
+
+#ifdef MARKLOCK_USES_SPIN
+/* MARKLOCK_USES_SPIN has the opposite meaning to NO_PTHREAD_TRYLOCK. */
+
+/* GC_pause() is the same as in pthread_support.c */
+
+/* Spend a few cycles in a way that can't introduce contention with	*/
+/* other threads.							*/
+STATIC void GC_pause(void)
+{
+    int i;
+#   if !defined(__GNUC__) || defined(__INTEL_COMPILER)
+      volatile word dummy = 0;
+#   endif
+
+    for (i = 0; i < 10; ++i) { 
+#     if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+        __asm__ __volatile__ (" " : : : "memory");
+#     else
+	/* Something that's unlikely to be optimized away. */
+	GC_noop1(++dummy);
+#     endif
+    }
+}
+
+#ifndef SPIN_MAX
+#define SPIN_MAX 128	/* Maximum number of calls to GC_pause before	*/
+#endif			/* give up.					*/
+
+STATIC int GC_spin_and_trylock(volatile AO_t *pmutex_waitcnt)
+{
+  unsigned pause_length;
+  unsigned i;
+  GC_pause();
+  for (pause_length = 2; pause_length <= SPIN_MAX; pause_length <<= 1) {
+    if (AO_compare_and_swap(pmutex_waitcnt, 0, 1))
+      return 0;
+    for (i = 0; i < pause_length; ++i)
+      GC_pause();
+  }
+  /* Failed. */
+  /* Don't do final AO_compare_and_swap() after pausing. */
+  return -1;
+}
+
+#endif /* MARKLOCK_USES_SPIN */
+
+/* #define LOCK_STATS */
+#ifdef LOCK_STATS
+  unsigned long GC_spin_count = 0;
+  unsigned long GC_block_count = 0;
+  unsigned long GC_unlocked_count = 0;
+#endif
+
+void GC_acquire_mark_lock(void)
+{
+# ifdef MARKLOCK_USES_SPIN
+  /* Here we implement the same spinning algorithm as	*/
+  /* in GC_generic_lock() in pthread_support.c.	*/
+
+  if (!AO_compare_and_swap(&GC_mark_mutex_waitcnt, 0, 1)) {
+    if (GC_spin_and_trylock(&GC_mark_mutex_waitcnt) < 0) {
+# endif
+
+      if ((signed_word)AO_fetch_and_add1_full(&GC_mark_mutex_waitcnt) > 0) {
+#	ifdef LOCK_STATS
+          ++GC_block_count;
+#	endif
+        if (WaitForSingleObject(mark_mutex_event, INFINITE) == WAIT_FAILED)
+          ABORT("WaitForSingleObject() failed");
+      }
+#     ifdef LOCK_STATS
+        else {
+#	  ifdef MARKLOCK_USES_SPIN
+	    ++GC_spin_count;
+#	  else
+	    ++GC_unlocked_count;
+#	  endif
+	}
+#     endif
+  
+# ifdef MARKLOCK_USES_SPIN
+    }
+#   ifdef LOCK_STATS
+      else ++GC_spin_count;
+#   endif
+  }
+# ifdef LOCK_STATS
+    else ++GC_unlocked_count;
+# endif
+# endif
+  
+  GC_ASSERT(GC_mark_lock_holder == NO_THREAD);
+# ifdef GC_ASSERTIONS
+    GC_mark_lock_holder = (unsigned long)GetCurrentThreadId();
+# endif
+}
+
+void GC_release_mark_lock(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == (unsigned long)GetCurrentThreadId());
+#   ifdef GC_ASSERTIONS
+	GC_mark_lock_holder = NO_THREAD;
+#   endif
+    if ((signed_word)AO_fetch_and_sub1_full(&GC_mark_mutex_waitcnt) > 1 &&
+	 SetEvent(mark_mutex_event) == FALSE)
+	ABORT("SetEvent() failed");
+}
+
+/* In GC_wait_for_reclaim/GC_notify_all_builder() we emulate POSIX	*/
+/* cond_wait/cond_broadcast() primitives with WinAPI Event object	*/
+/* (working in "manual reset" mode).  This works here because		*/
+/* GC_notify_all_builder() is always called holding lock on		*/
+/* mark_mutex and the checked condition (GC_fl_builder_count == 0)	*/
+/* is the only one for which broadcasting on builder_cv is performed.	*/
+
+void GC_wait_for_reclaim(void)
+{
+    GC_ASSERT(builder_cv != 0);
+    for (;;) {
+	GC_acquire_mark_lock();
+	if (GC_fl_builder_count == 0)
+	    break;
+	if (ResetEvent(builder_cv) == FALSE)
+	    ABORT("ResetEvent() failed");
+	GC_release_mark_lock();
+	if (WaitForSingleObject(builder_cv, INFINITE) == WAIT_FAILED)
+	    ABORT("WaitForSingleObject() failed");
+    }
+    GC_release_mark_lock();
+}
+
+void GC_notify_all_builder(void)
+{
+    GC_ASSERT(GC_mark_lock_holder == (unsigned long)GetCurrentThreadId());
+    GC_ASSERT(builder_cv != 0);
+    GC_ASSERT(GC_fl_builder_count == 0);
+    if (SetEvent(builder_cv) == FALSE)
+	ABORT("SetEvent() failed");
+}
+
+/* For GC_wait_marker/GC_notify_all_marker() the above technique does	*/
+/* not work because they are used with different checked conditions in	*/
+/* different places (and, in addition, notifying is done after leaving	*/
+/* critical section) and this could result in a signal loosing between	*/
+/* checking for a particular condition and calling WaitForSingleObject.	*/
+/* So, we use "auto-reset" Event and propagate a broadcast manually	*/
+/* (unless the current thread is the only one waiting on mark_cv).	*/
+
+STATIC HANDLE mark_cv = (HANDLE)0; /* Event with auto-reset */
+
+STATIC unsigned mark_cv_waiter_cnt = 0;
+
+void GC_wait_marker(void)
+{
+    /* Here we assume that GC_wait_marker() is always called	*/
+    /* from a while(check_cond) loop.				*/
+    GC_ASSERT(mark_cv != 0);
+    mark_cv_waiter_cnt++;
+    GC_release_mark_lock();
+    if (WaitForSingleObject(mark_cv, INFINITE) == WAIT_FAILED)
+	ABORT("WaitForSingleObject() failed");
+    /* WaitForSingleObject() unblocks one thread and resets mark_cv */
+    GC_acquire_mark_lock();
+    /* Unblock one more waiting thread (if any) */
+    if (--mark_cv_waiter_cnt != 0 && SetEvent(mark_cv) == FALSE)
+	ABORT("SetEvent() failed");
+}
+
+void GC_notify_all_marker(void)
+{
+    GC_ASSERT(mark_cv != 0);
+    if (SetEvent(mark_cv) == FALSE)
+	ABORT("SetEvent() failed");
+}
+
+#endif /* ! GC_PTHREADS */
+
+#endif /* PARALLEL_MARK */
+
 #ifndef GC_PTHREADS
 
 /* We have no DllMain to take care of new threads.  Thus we	*/
@@ -1281,7 +1685,72 @@
 #   endif
 	GC_get_stack_base(&sb);
     GC_ASSERT(sb_result == GC_SUCCESS);
+    
+#   ifdef PARALLEL_MARK
+#     ifndef GC_PTHREADS
+	mark_mutex_event = CreateEventA(NULL /* attrs */,
+				FALSE /* isManualReset */,
+				FALSE /* initialState */, NULL /* name */);
+	if (mark_mutex_event == (HANDLE)0)
+	  ABORT("CreateEvent() failed");
+	
+	builder_cv = CreateEventA(NULL /* attrs */, TRUE /* isManualReset */,
+			FALSE /* initialState */, NULL /* name */);
+	mark_cv = CreateEventA(NULL /* attrs */, FALSE /* isManualReset */,
+			FALSE /* initialState */, NULL /* name */);
+	if (builder_cv == (HANDLE)0 || mark_cv == (HANDLE)0)
+	  ABORT("CreateEvent() failed");
+#     endif
+
+      /* Set GC_markers. */
+      {
+	char * markers_string = GETENV("GC_MARKERS");
+	if (markers_string != NULL) {
+	  GC_markers = atoi(markers_string);
+	  if (GC_markers > MAX_MARKERS) {
+	    WARN("Limiting number of mark threads\n", 0);
+	    GC_markers = MAX_MARKERS;
+	  }
+	} else {
+#	  ifdef _WIN64
+	    DWORD_PTR procMask = 0;
+	    DWORD_PTR sysMask;
+#	  else
+	    DWORD procMask = 0;
+	    DWORD sysMask;
+#	  endif
+	  int ncpu = 0;
+	  if (GetProcessAffinityMask(GetCurrentProcess(),
+	  			(void *)&procMask, (void *)&sysMask)
+	      && procMask) {
+	    do {
+	      ncpu++;
+	    } while ((procMask &= procMask - 1) != 0);
+	  }
+	  GC_markers = ncpu;
+	  if (GC_markers >= MAX_MARKERS)
+	    GC_markers = MAX_MARKERS; /* silently limit GC_markers value */
+	}
+      }	
+      if (GC_markers <= 1 || GC_win32_dll_threads) {
+	GC_parallel = FALSE;
+	GC_markers = 1;
+      } else {
+	GC_parallel = TRUE;
+	/* Disable true incremental collection, but generational is OK.	*/
+	GC_time_limit = GC_TIME_UNLIMITED;
+      }
+      if (GC_print_stats) {
+	GC_log_printf("Number of marker threads = %ld\n", GC_markers);
+      }
+#   endif
+    
     GC_register_my_thread(&sb);
+
+#   ifdef PARALLEL_MARK
+      /* If we are using a parallel marker, actually start helper threads.  */
+      if (GC_parallel) start_mark_threads();
+#   endif
 }
 
 #ifdef GC_PTHREADS
@@ -1544,7 +2013,7 @@
 #       endif
 	    GC_get_stack_base(&sb);
         GC_ASSERT(sb_result == GC_SUCCESS);
-#       ifdef THREAD_LOCAL_ALLOC
+#       if defined(THREAD_LOCAL_ALLOC) || defined(PARALLEL_MARK)
 	  ABORT("Cannot initialize thread local cache from DllMain");
 #       endif
 	GC_register_my_thread_inner(&sb, thread_id);
@@ -1561,9 +2030,11 @@
    case DLL_PROCESS_DETACH:
     {
       int i;
+      int my_max;
 
       if (!GC_win32_dll_threads) return TRUE;
-      for (i = 0; i <= GC_get_max_thread_index(); ++i)
+      my_max = (int)GC_get_max_thread_index();
+      for (i = 0; i <= my_max; ++i)
       {
           if (AO_load(&(dll_thread_table[i].in_use)))
 	    GC_delete_gc_thread(dll_thread_table + i);
@@ -1611,7 +2082,7 @@
 #if defined(USE_PTHREAD_LOCKS)
   /* Support for pthread locking code.		*/
   /* Pthread_mutex_try_lock may not win here,	*/
-  /* due to builtinsupport for spinning first?	*/
+  /* due to builtin support for spinning first?	*/
 
 volatile GC_bool GC_collecting = 0;
 			/* A hint that we're in the collector and       */


More information about the Gc mailing list