diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h
index f33c72d4cf4d2a..5ab6d00fb0e510 100644
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@@ -112,6 +112,10 @@ struct _py_trashcan {
     PyObject *delete_later;
 };
 
+// We can't include pythread.h yet (due to include cycles)
+// so we duplicate PyThread_type_lock here.
+typedef void *PyThread_type_lock;
+
 struct _ts {
     /* See Python/ceval.c for comments explaining most fields */
 
@@ -188,29 +192,48 @@ struct _ts {
 
     struct _py_trashcan trash;
 
-    /* Called when a thread state is deleted normally, but not when it
-     * is destroyed after fork().
-     * Pain: to prevent rare but fatal shutdown errors (issue 18808),
-     * Thread.join() must wait for the join'ed thread's tstate to be unlinked
-     * from the tstate chain.  That happens at the end of a thread's life,
-     * in pystate.c.
-     * The obvious way doesn't quite work: create a lock which the tstate
-     * unlinking code releases, and have Thread.join() wait to acquire that
-     * lock.  The problem is that we _are_ at the end of the thread's life:
-     * if the thread holds the last reference to the lock, decref'ing the
-     * lock will delete the lock, and that may trigger arbitrary Python code
-     * if there's a weakref, with a callback, to the lock.  But by this time
-     * _PyRuntime.gilstate.tstate_current is already NULL, so only the simplest
-     * of C code can be allowed to run (in particular it must not be possible to
-     * release the GIL).
-     * So instead of holding the lock directly, the tstate holds a weakref to
-     * the lock: that's the value of on_delete_data below.  Decref'ing a
-     * weakref is harmless.
-     * on_delete points to _threadmodule.c's static release_sentinel() function.
-     * After the tstate is unlinked, release_sentinel is called with the
-     * weakref-to-lock (on_delete_data) argument, and release_sentinel releases
-     * the indirectly held lock.
-     */
+    struct {
+        /* Called when a thread state is deleted normally, but not when
+         * it is destroyed after fork().
+         *
+         * Pain: to prevent rare but fatal shutdown errors (issue 18808),
+         * Thread.join() must wait for the join'ed thread's tstate to be
+         * unlinked from the tstate chain.  That happens at the end of a
+         * thread's life, in pystate.c.
+         *
+         * The obvious way doesn't quite work: create a lock which the
+         * tstate unlinking code releases, and have Thread.join() wait
+         * to acquire that lock.  The problem is that we _are_ at the end
+         * of the thread's life: if the thread holds the last reference
+         * to the lock, decref'ing the lock will delete the lock, and
+         * that may trigger arbitrary Python code if there's a weakref,
+         * with a callback, to the lock.  But by this time the current
+         * tstate is already NULL, so only the simplest of C code can be
+         * allowed to run (in particular it must not be possible to
+         * release the GIL).
+         *
+         * So instead of holding the lock directly, the tstate holds
+         * a weakref to the lock: that's the value of .lock_weakref
+         * below.  Decref'ing a weakref is harmless.
+         *
+         * .pre_delete points to the thread_prepare_delete static function
+         * in _threadmodule.c.  It gets called right _before_ the tstate
+         * is deleted, with the GIL held, and returns the lock to release.
+         * It also adds a pending call to delete the lock once the lock
+         * has been released.  This works because the pending call won't
+         * run until another thread of the interpreter takes the GIL.
+         *
+         * It is important that the lock be released _after_ the GIL
+         * is released to avoid a race with threading._shutdown().
+         */
+        // XXX Does all of the above explanation still hold?
+        PyThread_type_lock (*pre_delete)(PyThreadState *);
+        PyObject *lock_weakref;
+    } _threading_thread;
+    /* These weren't ever meant to be used except internally,
+     * but we're keeping them around just in case.  Internally,
+     * we use the _threading_thread field. */
+    /* XXX Drop them! */
     void (*on_delete)(void *);
     void *on_delete_data;
 
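
To make the contract in the comment above concrete: the owner of the tstate calls the hook with the GIL held, then releases the returned lock only once the GIL is gone. The sketch below is not part of the patch (the helper name delete_tstate_sketch is invented); it mirrors what the Python/pystate.c hunk further down does.

    #include "Python.h"

    /* Sketch only: how the new pre_delete hook is meant to be driven,
       assuming the patched pystate.h and a tstate whose hook was installed
       by _thread._set_sentinel().  The GIL is held on entry. */
    static void
    delete_tstate_sketch(PyThreadState *tstate)
    {
        PyThread_type_lock lock = NULL;
        if (tstate->_threading_thread.pre_delete != NULL) {
            /* May queue a pending call; returns the sentinel lock to
               release, or NULL if there is nothing to do. */
            lock = tstate->_threading_thread.pre_delete(tstate);
        }

        /* ... unlink the tstate and release the GIL here ... */

        if (lock != NULL) {
            /* Only once the GIL is dropped: let threading._shutdown()
               (and Thread.join()) proceed. */
            PyThread_release_lock(lock);
        }
    }
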
diff --git a/Include/internal/pycore_ceval_state.h b/Include/internal/pycore_ceval_state.h
index 95d1fa16ba40dc..88c5aa91680d23 100644
--- a/Include/internal/pycore_ceval_state.h
+++ b/Include/internal/pycore_ceval_state.h
@@ -71,7 +71,7 @@ struct _pending_calls {
        thread state.
        Guarded by the GIL. */
     int async_exc;
-#define NPENDINGCALLS 32
+#define NPENDINGCALLS 100
     struct {
         int (*func)(void *);
         void *arg;
diff --git a/Modules/_threadmodule.c b/Modules/_threadmodule.c
index 5d753b4a0ebc5e..4c80beb21f2b0a 100644
--- a/Modules/_threadmodule.c
+++ b/Modules/_threadmodule.c
@@ -1307,56 +1307,136 @@ yet finished.\n\
 This function is meant for internal and specialized purposes only.\n\
 In most applications `threading.enumerate()` should be used instead.");
 
-static void
-release_sentinel(void *wr_raw)
+
+/* Here we're essentially cleaning up after a thread that finished
+   and has already been deallocated (both the threading.Thread and
+   the tstate).  Thus it will run in a different thread of the
+   same interpreter (via a "pending call"). */
+static int
+clean_up_sentinel(void *data)
 {
-    PyObject *wr = _PyObject_CAST(wr_raw);
-    /* Tricky: this function is called when the current thread state
-       is being deleted.  Therefore, only simple C code can safely
-       execute here. */
-    PyObject *obj = PyWeakref_GET_OBJECT(wr);
-    lockobject *lock;
-    if (obj != Py_None) {
-        lock = (lockobject *) obj;
-        if (lock->locked) {
-            PyThread_release_lock(lock->lock_lock);
-            lock->locked = 0;
+    PyObject *lockobj = (PyObject *)data;
+    assert(lockobj != NULL);
+    PyThread_type_lock lock = ((lockobject *)lockobj)->lock_lock;
+
+    /* Wait here until we know for sure that the thread running
+       _PyThreadState_DeleteCurrent() has released the lock. */
+    if (acquire_timed(lock, 0) == PY_LOCK_ACQUIRED) {
+        /* _PyThreadState_DeleteCurrent() finished, so we can proceed. */
+        PyThread_release_lock(lock);
+        ((lockobject *)lockobj)->locked = 0;
+    }
+    else if (((lockobject *)lockobj)->locked == 2) {
+        /* _PyThreadState_DeleteCurrent() is still holding
+           the lock, so we will wait here until it is released.
+           We don't need to hold the GIL while we wait though. */
+        ((lockobject *)lockobj)->locked = 1;
+
+        PyLockStatus r = acquire_timed(lock, -1);
+        // XXX Why do we have to loop?
+        while (r == PY_LOCK_FAILURE) {
+            r = acquire_timed(lock, -1);
         }
+        PyThread_release_lock(lock);
+        ((lockobject *)lockobj)->locked = 0;
     }
+    /* Otherwise the current thread acquired the lock right before
+       its eval loop was interrupted to run this pending call.
+       We can simply let it proceed. */
+
+    /* In all cases, at this point we are done with the lock. */
+    Py_DECREF(lockobj);
+    return 0;
+}
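
clean_up_sentinel() follows the usual ownership rules for Py_AddPendingCall(): the argument has to stay alive until the callback runs in whichever thread of the interpreter next takes the GIL, and the callback returns 0 on success. A minimal sketch of that pattern in isolation (illustrative only; drop_reference_later and schedule_cleanup are invented names, not part of the patch):

    #include "Python.h"

    /* Runs later, with the GIL held, in whichever thread of the
       interpreter next processes pending calls. */
    static int
    drop_reference_later(void *arg)
    {
        Py_DECREF((PyObject *)arg);
        return 0;                       /* 0 means "no error" */
    }

    static int
    schedule_cleanup(PyObject *obj)
    {
        Py_INCREF(obj);                 /* hand ownership to the pending call */
        if (Py_AddPendingCall(drop_reference_later, obj) < 0) {
            Py_DECREF(obj);             /* queue was full; undo the transfer */
            return -1;
        }
        return 0;
    }
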
+
+static PyThread_type_lock
+thread_prepare_delete(PyThreadState *tstate)
+{
+    assert(tstate->_threading_thread.pre_delete != NULL);
+    PyThread_type_lock lock = NULL;
+
+    /* Tricky: this function is called when the current thread state
+       is being deleted.  Therefore, only simple C code can safely
+       execute here.  The GIL is still held.
+     */
+    PyObject *wr = tstate->_threading_thread.lock_weakref;
+    assert(wr != NULL);
+    PyObject *lockobj = PyWeakref_GET_OBJECT(wr);
+    if (lockobj == Py_None) {
+        /* The thread has already been destroyed, so we can clean up now. */
+        goto done;
+    }
+    if (_PyThreadState_GET() != tstate) {
+        assert(PyThread_get_thread_ident() != tstate->thread_id);
+        /* It must be a daemon thread that was killed during
+         * interp/runtime finalization, so there's nothing to do. */
+        goto done;
+    }
+    assert(((lockobject *)lockobj)->locked == 1);
+    assert(acquire_timed(((lockobject *)lockobj)->lock_lock, 0) == PY_LOCK_FAILURE);
+    /* We cheat a little here to allow clean_up_sentinel() to know
+       that this thread is still holding the lock.  The value will be
+       reset to the normal 0 or 1 as soon as any other thread
+       uses the lock. */
+    ((lockobject *)lockobj)->locked = 2;
+
+    /* We need to prevent the underlying PyThread_type_lock from getting
+       destroyed before we release it in _PyThreadState_DeleteCurrent().
+       However, we don't need the weakref any more. */
+    Py_INCREF(lockobj);
+
+    /* The pending call will be run the next time the GIL is taken
+       by one of this interpreter's threads. */
+    void *data = (void *) lockobj;
+    if (Py_AddPendingCall(clean_up_sentinel, data) < 0) {
+        Py_DECREF(lockobj);
+        /* We otherwise ignore the error.  A non-zero value means
+           there were too many pending calls already queued up.
+           This case is unlikely, and, at worst,
+           we'll just leak the lock.
+         */
+        goto done;
+    }
+
+    lock = ((lockobject *)lockobj)->lock_lock;
+
+done:
     /* Deallocating a weakref with a NULL callback only calls
        PyObject_GC_Del(), which can't call any Python code. */
     Py_DECREF(wr);
+    tstate->_threading_thread.pre_delete = NULL;
+    tstate->_threading_thread.lock_weakref = NULL;
+    return lock;
 }
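
thread_prepare_delete() depends on the fact that PyWeakref_GET_OBJECT() returns a borrowed reference (or Py_None once the referent is gone), so it must take its own strong reference before ownership passes to the pending call. That borrow-then-own step looks roughly like this in isolation (a sketch only; claim_referent is an invented name):

    #include "Python.h"

    /* Returns a new strong reference to the weakref's referent,
       or NULL (without an exception set) if the referent is gone. */
    static PyObject *
    claim_referent(PyObject *weakref)
    {
        PyObject *obj = PyWeakref_GET_OBJECT(weakref);  /* borrowed */
        if (obj == Py_None) {
            return NULL;
        }
        Py_INCREF(obj);     /* now safe to keep past the next GIL release */
        return obj;
    }
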
 
 static PyObject *
 thread__set_sentinel(PyObject *module, PyObject *Py_UNUSED(ignored))
 {
-    PyObject *wr;
     PyThreadState *tstate = _PyThreadState_GET();
-    lockobject *lock;
 
-    if (tstate->on_delete_data != NULL) {
+    if (tstate->_threading_thread.lock_weakref != NULL) {
         /* We must support the re-creation of the lock from a
            fork()ed child. */
-        assert(tstate->on_delete == &release_sentinel);
-        wr = (PyObject *) tstate->on_delete_data;
-        tstate->on_delete = NULL;
-        tstate->on_delete_data = NULL;
-        Py_DECREF(wr);
-    }
-    lock = newlockobject(module);
-    if (lock == NULL)
+        assert(tstate->_threading_thread.pre_delete == &thread_prepare_delete);
+        PyObject *old_wr = tstate->_threading_thread.lock_weakref;
+        tstate->_threading_thread.pre_delete = NULL;
+        tstate->_threading_thread.lock_weakref = NULL;
+        Py_DECREF(old_wr);
+    }
+
+    PyObject *lockobj = (PyObject *) newlockobject(module);
+    if (lockobj == NULL) {
         return NULL;
+    }
     /* The lock is owned by whoever called _set_sentinel(), but the weakref
        hangs to the thread state. */
-    wr = PyWeakref_NewRef((PyObject *) lock, NULL);
+    PyObject *wr = PyWeakref_NewRef(lockobj, NULL);
     if (wr == NULL) {
-        Py_DECREF(lock);
+        Py_DECREF(lockobj);
         return NULL;
     }
-    tstate->on_delete_data = (void *) wr;
-    tstate->on_delete = &release_sentinel;
-    return (PyObject *) lock;
+    tstate->_threading_thread.pre_delete = &thread_prepare_delete;
+    tstate->_threading_thread.lock_weakref = wr;
+    return lockobj;
 }
 
 PyDoc_STRVAR(_set_sentinel_doc,
diff --git a/Python/pystate.c b/Python/pystate.c
index 25e655a2027918..7c940aae628c9f 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -1490,6 +1490,18 @@ PyThreadState_Clear(PyThreadState *tstate)
 
     Py_CLEAR(tstate->context);
 
+    if (tstate != current_fast_get(&_PyRuntime)) {
+        /* The "current" case is handled in _PyThreadState_DeleteCurrent(). */
+        if (tstate->_threading_thread.pre_delete != NULL) {
+#ifdef NDEBUG
+            (void) tstate->_threading_thread.pre_delete(tstate);
+#else
+            PyThread_type_lock lock;
+            lock = tstate->_threading_thread.pre_delete(tstate);
+            assert(lock == NULL);
+#endif
+        }
+    }
     if (tstate->on_delete != NULL) {
         tstate->on_delete(tstate->on_delete_data);
     }
@@ -1505,6 +1517,7 @@ static void
 tstate_delete_common(PyThreadState *tstate)
 {
     assert(tstate->_status.cleared && !tstate->_status.finalized);
+    // XXX assert(tstate->_threading_thread.pre_delete == NULL);
 
     PyInterpreterState *interp = tstate->interp;
     if (interp == NULL) {
@@ -1565,9 +1578,33 @@ void
 _PyThreadState_DeleteCurrent(PyThreadState *tstate)
 {
     _Py_EnsureTstateNotNULL(tstate);
+
+    PyThread_type_lock lock = NULL;
+    if (tstate->_threading_thread.pre_delete != NULL) {
+        /* This may queue up a pending call that will run in a
+           different thread in the same interpreter.  It will only
+           run _after_ we've released the GIL below.
+
+           lock will be NULL if the threading.Thread has already been
+           destroyed, if there are too many pending calls already queued
+           up, or if _PyThreadState_DeleteCurrent() wasn't called by
+           thread_run() (in _threadmodule.c).
+         */
+        lock = tstate->_threading_thread.pre_delete(tstate);
+    }
+
     tstate_delete_common(tstate);
     current_fast_clear(tstate->interp->runtime);
     _PyEval_ReleaseLock(tstate);
+
+    if (lock != NULL) {
+        /* Notify threading._shutdown() that this thread has been finalized.
+           This must happen *after* the GIL is released,
+           to avoid a race with threading._shutdown()
+           (via wait_for_thread_shutdown() in pylifecycle.c). */
+        PyThread_release_lock(lock);
+    }
+
     free_threadstate(tstate);
 }
diff --git a/Tools/c-analyzer/c_analyzer/analyze.py b/Tools/c-analyzer/c_analyzer/analyze.py
index 267d058e07abdb..662bb44ba6704d 100644
--- a/Tools/c-analyzer/c_analyzer/analyze.py
+++ b/Tools/c-analyzer/c_analyzer/analyze.py
@@ -174,7 +174,14 @@ def find_typedecl(decl, typespec, typespecs):
             # If the decl is in a source file then we expect the
             # type to be in the same file or in a header file.
             continue
-        candidates.append(typedecl)
+        for c in candidates:
+            if c.name == typedecl.name and c.data == typedecl.data:
+                # The type was duplicated in another file.
+                assert c.parent == typedecl.parent
+                break
+        else:
+            candidates.append(typedecl)
+
     if not candidates:
         return None
     elif len(candidates) == 1:
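
Stepping back to the ordering rule in _PyThreadState_DeleteCurrent() above: it matters because the waiting side (threading._shutdown() via Thread.join()) blocks on the sentinel lock while not holding the GIL. A rough C-level sketch of that waiting side (illustrative only, not the actual threading or pylifecycle.c code; wait_for_sentinel is an invented name):

    #include "Python.h"

    /* Sketch only: block on a thread's sentinel lock.  The GIL is dropped
       around the blocking acquire so other threads, including the pending
       call that runs clean_up_sentinel(), can make progress meanwhile. */
    static void
    wait_for_sentinel(PyThread_type_lock sentinel)
    {
        Py_BEGIN_ALLOW_THREADS
        PyThread_acquire_lock(sentinel, WAIT_LOCK);
        Py_END_ALLOW_THREADS
        PyThread_release_lock(sentinel);
    }
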