diff --git a/src/Makefile b/src/Makefile
index 1a9af2fa7c439..58ba268d482f2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -43,7 +43,7 @@ RUNTIME_SRCS := \
 	jltypes gf typemap smallintset ast builtins module interpreter symbol \
 	dlload sys init task array dump staticdata toplevel jl_uv datatype \
 	simplevector runtime_intrinsics precompile \
-	threading partr stackwalk gc gc-debug gc-pages gc-stacks method \
+	threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
 	jlapi signal-handling safepoint timing subtype \
 	crc32c APInt-C processor ircode opaque_closure
 SRCS := jloptions runtime_ccall rtutils
diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
new file mode 100644
index 0000000000000..ce6adc4c8b787
--- /dev/null
+++ b/src/gc-alloc-profiler.cpp
@@ -0,0 +1,148 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "gc-alloc-profiler.h"
+
+#include "julia_internal.h"
+#include "gc.h"
+
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+struct jl_raw_backtrace_t {
+    jl_bt_element_t *data;
+    size_t size;
+};
+
+struct jl_raw_alloc_t {
+    jl_datatype_t *type_address;
+    jl_raw_backtrace_t backtrace;
+    size_t size;
+};
+
+// == These structs define the global singleton profile buffer that will be used by
+// callbacks to store profile results. ==
+struct jl_per_thread_alloc_profile_t {
+    vector<jl_raw_alloc_t> allocs;
+};
+
+struct jl_alloc_profile_t {
+    double sample_rate;
+
+    vector<jl_per_thread_alloc_profile_t> per_thread_profiles;
+};
+
+struct jl_combined_results {
+    vector<jl_raw_alloc_t> combined_allocs;
+};
+
+// == Global variables manipulated by callbacks ==
+
+jl_alloc_profile_t g_alloc_profile;
+int g_alloc_profile_enabled = false;
+jl_combined_results g_combined_results; // Will live forever.
+
+// === stack stuff ===
+
+jl_raw_backtrace_t get_raw_backtrace() JL_NOTSAFEPOINT {
+    // We first record the backtrace onto a MAX-sized buffer, so that we don't have to
+    // allocate the buffer until we know the size. To ensure thread-safety, we use a
+    // per-thread backtrace buffer.
+    jl_ptls_t ptls = jl_current_task->ptls;
+    jl_bt_element_t *shared_bt_data_buffer = ptls->profiling_bt_buffer;
+    if (shared_bt_data_buffer == NULL) {
+        size_t size = sizeof(jl_bt_element_t) * (JL_MAX_BT_SIZE + 1);
+        shared_bt_data_buffer = (jl_bt_element_t*) malloc_s(size);
+        ptls->profiling_bt_buffer = shared_bt_data_buffer;
+    }
+
+    size_t bt_size = rec_backtrace(shared_bt_data_buffer, JL_MAX_BT_SIZE, 2);
+
+    // Then we copy only the needed bytes out of the buffer into our profile.
+    size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
+    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc_s(bt_bytes);
+    memcpy(bt_data, shared_bt_data_buffer, bt_bytes);
+
+    return jl_raw_backtrace_t{
+        bt_data,
+        bt_size
+    };
+}
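The scratch-then-copy pattern above is worth internalizing: the backtrace is captured into a reusable per-thread buffer sized for the worst case, and only the bytes actually used are copied into the stored profile. A minimal Julia sketch of the same idea (all names here are hypothetical, for illustration only):

```julia
# Capture into a reusable max-size scratch buffer, then store an
# exact-size copy -- the same shape as the malloc_s + memcpy above.
const MAX_TRACE_LEN = 80_000  # stands in for JL_MAX_BT_SIZE
const scratch = Vector{UInt}(undef, MAX_TRACE_LEN + 1)

function snapshot_trace(n_valid::Int)
    @assert 0 <= n_valid <= length(scratch)
    return scratch[1:n_valid]  # copies only the entries actually filled
end
```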
+
+// == exported interface ==
+
+extern "C" { // Use C linkage so these are callable from Julia via ccall.
+
+JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) {
+    // We only need to do this once, the first time this is called.
+    while (g_alloc_profile.per_thread_profiles.size() < jl_n_threads) {
+        g_alloc_profile.per_thread_profiles.push_back(jl_per_thread_alloc_profile_t{});
+    }
+
+    g_alloc_profile.sample_rate = sample_rate;
+    g_alloc_profile_enabled = true;
+}
+
+JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile() {
+    // combine allocs
+    // TODO: interleave to preserve ordering
+    for (auto& profile : g_alloc_profile.per_thread_profiles) {
+        for (const auto& alloc : profile.allocs) {
+            g_combined_results.combined_allocs.push_back(alloc);
+        }
+
+        profile.allocs.clear();
+    }
+
+    return jl_profile_allocs_raw_results_t{
+        g_combined_results.combined_allocs.data(),
+        g_combined_results.combined_allocs.size(),
+    };
+}
+
+JL_DLLEXPORT void jl_stop_alloc_profile() {
+    g_alloc_profile_enabled = false;
+}
+
+JL_DLLEXPORT void jl_free_alloc_profile() {
+    // Free any allocs that remain in the per-thread profiles, that haven't
+    // been combined yet (which happens in jl_fetch_alloc_profile()).
+    for (auto& profile : g_alloc_profile.per_thread_profiles) {
+        for (auto alloc : profile.allocs) {
+            free(alloc.backtrace.data);
+        }
+        profile.allocs.clear();
+    }
+
+    // Free the allocs that have already been combined into the combined results object.
+    for (auto alloc : g_combined_results.combined_allocs) {
+        free(alloc.backtrace.data);
+    }
+
+    g_combined_results.combined_allocs.clear();
+}
+
+// == callback called into by the outside ==
+
+void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *type) JL_NOTSAFEPOINT {
+    auto& global_profile = g_alloc_profile;
+    auto thread_id = jl_atomic_load_relaxed(&jl_current_task->tid);
+    auto& profile = global_profile.per_thread_profiles[thread_id];
+
+    auto sample_val = double(rand()) / double(RAND_MAX);
+    auto should_record = sample_val <= global_profile.sample_rate;
+    if (!should_record) {
+        return;
+    }
+
+    profile.allocs.emplace_back(jl_raw_alloc_t{
+        type,
+        get_raw_backtrace(),
+        size
+    });
+}
+
+} // extern "C"
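These four exported entry points are the whole C surface of the profiler; the stdlib wrapper later in this diff drives them via `ccall`. A sketch of that call sequence, for orientation (the real wrapper also decodes the fetched results):

```julia
# Start sampling ~1% of allocations, run a workload, then stop.
ccall(:jl_start_alloc_profile, Cvoid, (Cdouble,), 0.01)
zeros(10_000)  # any allocating workload
ccall(:jl_stop_alloc_profile, Cvoid, ())
# jl_fetch_alloc_profile returns a (pointer, count) struct; see the
# RawResults mirror defined in Allocs.jl below. Freeing is explicit:
ccall(:jl_free_alloc_profile, Cvoid, ())
```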
diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h
new file mode 100644
index 0000000000000..8be6fed21a899
--- /dev/null
+++ b/src/gc-alloc-profiler.h
@@ -0,0 +1,51 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#ifndef JL_GC_ALLOC_PROFILER_H
+#define JL_GC_ALLOC_PROFILER_H
+
+#include "julia.h"
+#include "ios.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ---------------------------------------------------------------------
+// The public interface to call from Julia for allocations profiling
+// ---------------------------------------------------------------------
+
+// Forward-declaration to avoid a dependency in the header file.
+struct jl_raw_alloc_t; // Defined in gc-alloc-profiler.cpp
+
+typedef struct {
+    struct jl_raw_alloc_t *allocs;
+    size_t num_allocs;
+} jl_profile_allocs_raw_results_t;
+
+JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate);
+JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile(void);
+JL_DLLEXPORT void jl_stop_alloc_profile(void);
+JL_DLLEXPORT void jl_free_alloc_profile(void);
+
+// ---------------------------------------------------------------------
+// Functions to call from GC when alloc profiling is enabled
+// ---------------------------------------------------------------------
+
+void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT;
+
+extern int g_alloc_profile_enabled;
+
+#define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03)
+
+static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT {
+    if (__unlikely(g_alloc_profile_enabled)) {
+        _maybe_record_alloc_to_profile(val, size, typ);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JL_GC_ALLOC_PROFILER_H
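The header's split between `maybe_record_alloc_to_profile` (inlined, guarded by `__unlikely`) and `_maybe_record_alloc_to_profile` (out of line) keeps the disabled-profiler cost to a single predictable branch per allocation. A Julia analogue of the pattern, with hypothetical names:

```julia
const enabled = Ref(false)  # plays the role of g_alloc_profile_enabled

@noinline slow_record(val, size) = nothing  # stands in for _maybe_record_alloc_to_profile

@inline function maybe_record(val, size)
    # Cheap branch on the hot path; the slow path stays out of line.
    enabled[] && slow_record(val, size)
    return nothing
end
```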
diff --git a/src/gc.c b/src/gc.c
index 1ba8f65c09efd..f6e0dbf6d2499 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -942,7 +942,7 @@ static void sweep_weak_refs(void)
 // big value list
 
 // Size includes the tag and the tag is not cleared!!
-JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
+static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
 {
     maybe_collect(ptls);
     size_t offs = offsetof(bigval_t, header);
@@ -970,6 +970,22 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
     return jl_valueof(&v->header);
 }
 
+// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code.
+JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
+{
+    jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz);
+
+    maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag);
+    return val;
+}
+
+// This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into
+// its callers. We provide an external-facing interface for callers, and inline
+// `jl_gc_big_alloc_inner` into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
+jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz) {
+    return jl_gc_big_alloc_inner(ptls, sz);
+}
+
 // Sweep list rooted at *pv, removing and freeing any unmarked objects.
 // Return pointer to last `next` field in the culled list.
 static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
@@ -1195,7 +1211,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT
 }
 
 // Size includes the tag and the tag is not cleared!!
-JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
+static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset,
                                           int osize)
 {
     // Use the pool offset instead of the pool address as the argument
@@ -1251,6 +1267,23 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
     return jl_valueof(v);
 }
 
+// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code.
+JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
+                                          int osize)
+{
+    jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
+
+    maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag);
+    return val;
+}
+
+// This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into
+// its callers. We provide an external-facing interface for callers, and inline
+// `jl_gc_pool_alloc_inner` into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
+jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize) {
+    return jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
+}
+
 int jl_gc_classify_pools(size_t sz, int *osize)
 {
     if (sz > GC_MAX_SZCLASS)
@@ -3505,6 +3538,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     SetLastError(last_error);
 #endif
     errno = last_errno;
+    // jl_gc_managed_malloc is currently always used for allocating array buffers.
+    maybe_record_alloc_to_profile(b, sz, (jl_datatype_t*)jl_buff_tag);
     return b;
 }
 
@@ -3546,7 +3581,7 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
     SetLastError(last_error);
 #endif
     errno = last_errno;
-
+    maybe_record_alloc_to_profile(b, sz, jl_gc_unknown_type_tag);
     return b;
 }
 
diff --git a/src/gc.h b/src/gc.h
index 19fe3401665d1..bb2d87c0b316b 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -26,6 +26,7 @@
 #endif
 #endif
 #include "julia_assert.h"
+#include "gc-alloc-profiler.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/julia_internal.h b/src/julia_internal.h
index fbded1926dfb6..cd8e777e2aa3c 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -5,6 +5,7 @@
 
 #include "options.h"
 #include "julia_locks.h"
+#include "gc-alloc-profiler.h"
 #include <uv.h>
 #if !defined(_WIN32)
 #include <unistd.h>
@@ -225,9 +226,9 @@ extern jl_array_t *jl_all_methods JL_GLOBALLY_ROOTED;
 JL_DLLEXPORT extern int jl_lineno;
 JL_DLLEXPORT extern const char *jl_filename;
 
-JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
-                                          int osize);
-JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t allocsz);
+jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset,
+                                      int osize);
+jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz);
 int jl_gc_classify_pools(size_t sz, int *osize);
 extern jl_mutex_t gc_perm_lock;
 void *jl_gc_perm_alloc_nolock(size_t sz, int zero,
@@ -336,14 +337,17 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
         int pool_id = jl_gc_szclass(allocsz);
         jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
         int osize = jl_gc_sizeclasses[pool_id];
-        v = jl_gc_pool_alloc(ptls, (char*)p - (char*)ptls, osize);
+        // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in
+        // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
+        v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize);
     }
     else {
         if (allocsz < sz) // overflow in adding offs, size was "negative"
            jl_throw(jl_memory_exception);
-        v = jl_gc_big_alloc(ptls, allocsz);
+        v = jl_gc_big_alloc_noinline(ptls, allocsz);
    }
    jl_set_typeof(v, ty);
+    maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty);
    return v;
 }
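The `_noinline` indirection exists because `jl_gc_alloc_` records the allocation itself, with the real type; if it called the instrumented `jl_gc_pool_alloc`/`jl_gc_big_alloc` entry points, each allocation would be logged twice. A toy Julia model of the hazard (hypothetical names):

```julia
records = Tuple{Symbol,Int}[]
alloc_inner(sz) = Vector{UInt8}(undef, sz)  # stands in for the raw allocator

function alloc_instrumented(sz)  # like jl_gc_pool_alloc: type unknown here
    v = alloc_inner(sz)
    push!(records, (:unknown, sz))
    return v
end

function alloc_typed(sz)  # like jl_gc_alloc_: knows the type, records itself
    v = alloc_inner(sz)   # must bypass alloc_instrumented to avoid a double entry
    push!(records, (:typed, sz))
    return v
end
```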
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 57270832123c3..549981c2137c2 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -238,6 +238,8 @@ typedef struct _jl_tls_states_t {
     // Temporary backtrace buffer. Scanned for gc roots when bt_size > 0.
     struct _jl_bt_element_t *bt_data; // JL_MAX_BT_SIZE + 1 elements long
     size_t bt_size;    // Size for backtrace in transit in bt_data
+    // Temporary backtrace buffer used only for the allocations profiler.
+    struct _jl_bt_element_t *profiling_bt_buffer;
     // Atomically set by the sender, reset by the handler.
     volatile _Atomic(sig_atomic_t) signal_request; // TODO: no actual reason for this to be _Atomic
     // Allow the sigint to be raised asynchronously
diff --git a/stdlib/Profile/src/Allocs.jl b/stdlib/Profile/src/Allocs.jl
new file mode 100644
index 0000000000000..15a7afbff76fe
--- /dev/null
+++ b/stdlib/Profile/src/Allocs.jl
@@ -0,0 +1,210 @@
+module Allocs
+
+using Base.StackTraces: StackTrace, StackFrame, lookup
+using Base: InterpreterIP
+
+# --- Raw results structs, originally defined in C ---
+
+# The C jl_bt_element_t object contains either an IP pointer (size_t) or a void*.
+const BTElement = Csize_t
+
+# matches jl_raw_backtrace_t on the C side
+struct RawBacktrace
+    data::Ptr{BTElement} # in C: *jl_bt_element_t
+    size::Csize_t
+end
+
+# matches jl_raw_alloc_t on the C side
+struct RawAlloc
+    type::Ptr{Type}
+    backtrace::RawBacktrace
+    size::Csize_t
+end
+
+# matches jl_profile_allocs_raw_results_t on the C side
+struct RawResults
+    allocs::Ptr{RawAlloc}
+    num_allocs::Csize_t
+end
+
+"""
+    Profile.Allocs.@profile [sample_rate=0.0001] expr
+
+Profile allocations that happen during `expr`, returning
+both the result and an AllocResults struct.
+
+A sample rate of 1.0 will record everything; 0.0 will record nothing.
+
+```julia
+julia> Profile.Allocs.@profile sample_rate=0.01 peakflops()
+1.03733270279065e11
+
+julia> results = Profile.Allocs.fetch()
+
+julia> last(sort(results.allocs, by=x->x.size))
+Profile.Allocs.Alloc(Vector{Any}, Base.StackTraces.StackFrame[_new_array_ at array.c:127, ...], 5576)
+```
+
+!!! note
+    The current implementation of the Allocations Profiler does not
+    capture types for all allocations. Allocations for which the profiler
+    could not capture the type are represented as having type
+    `Profile.Allocs.UnknownType`.
+
+    You can read more about the missing types and the plan to improve this here:
+    https://github.com/JuliaLang/julia/issues/43688.
+
+!!! compat "Julia 1.8"
+    The allocation profiler was added in Julia 1.8.
+"""
+macro profile(opts, ex)
+    _prof_expr(ex, opts)
+end
+macro profile(ex)
+    _prof_expr(ex, :(sample_rate=0.0001))
+end
+
+function _prof_expr(expr, opts)
+    quote
+        $start(; $(esc(opts)))
+        try
+            $(esc(expr))
+        finally
+            $stop()
+        end
+    end
+end
+
+"""
+    Profile.Allocs.start(; sample_rate::Real)
+
+Begin recording allocations with the given sample rate.
+A sample rate of 1.0 will record everything; 0.0 will record nothing.
+"""
+function start(; sample_rate::Real)
+    ccall(:jl_start_alloc_profile, Cvoid, (Cdouble,), Float64(sample_rate))
+end
+
+"""
+    Profile.Allocs.stop()
+
+Stop recording allocations.
+"""
+function stop()
+    ccall(:jl_stop_alloc_profile, Cvoid, ())
+end
+
+"""
+    Profile.Allocs.clear()
+
+Clear all previously profiled allocation information from memory.
+"""
+function clear()
+    ccall(:jl_free_alloc_profile, Cvoid, ())
+    return nothing
+end
+
+"""
+    Profile.Allocs.fetch()
+
+Retrieve the recorded allocations, and decode them into Julia
+objects that can be analyzed.
+"""
+function fetch()
+    raw_results = ccall(:jl_fetch_alloc_profile, RawResults, ())
+    return decode(raw_results)
+end
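For regions that can't be wrapped in a single expression, `start`/`stop`/`fetch` can be called directly; this is exactly the shape `_prof_expr` expands to:

```julia
Profile.Allocs.start(sample_rate=0.01)
try
    # ... the workload to be profiled ...
finally
    Profile.Allocs.stop()
end
results = Profile.Allocs.fetch()
```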
+
+# decoded results
+
+struct Alloc
+    type::Any
+    stacktrace::StackTrace
+    size::Int
+end
+
+struct AllocResults
+    allocs::Vector{Alloc}
+end
+
+# Without this, the Alloc's stacktrace prints for lines and lines and lines...
+function Base.show(io::IO, a::Alloc)
+    stacktrace_sample = length(a.stacktrace) >= 1 ? "$(a.stacktrace[1]), ..." : ""
+    print(io, "$Alloc($(a.type), $StackFrame[$stacktrace_sample], $(a.size))")
+end
+
+const BacktraceCache = Dict{BTElement,Vector{StackFrame}}
+
+# copied from julia_internal.h
+const JL_BUFF_TAG = UInt(0x4eadc000)
+const JL_GC_UNKNOWN_TYPE_TAG = UInt(0xdeadaa03)
+
+struct CorruptType end
+struct BufferType end
+struct UnknownType end
+
+function load_type(ptr::Ptr{Type})
+    if UInt(ptr) < UInt(4096)
+        return CorruptType
+    elseif UInt(ptr) == JL_BUFF_TAG
+        return BufferType
+    elseif UInt(ptr) == JL_GC_UNKNOWN_TYPE_TAG
+        return UnknownType
+    end
+    return unsafe_pointer_to_objref(ptr)
+end
+
+function decode_alloc(cache::BacktraceCache, raw_alloc::RawAlloc)::Alloc
+    Alloc(
+        load_type(raw_alloc.type),
+        stacktrace_memoized(cache, load_backtrace(raw_alloc.backtrace)),
+        UInt(raw_alloc.size)
+    )
+end
+
+function decode(raw_results::RawResults)::AllocResults
+    cache = BacktraceCache()
+    allocs = [
+        decode_alloc(cache, unsafe_load(raw_results.allocs, i))
+        for i in 1:raw_results.num_allocs
+    ]
+    return AllocResults(allocs)
+end
+
+function load_backtrace(trace::RawBacktrace)::Vector{BTElement}
+    out = Vector{BTElement}()
+    for i in 1:trace.size
+        push!(out, unsafe_load(trace.data, i))
+    end
+
+    return out
+end
+
+function stacktrace_memoized(
+    cache::BacktraceCache,
+    trace::Vector{BTElement},
+    c_funcs::Bool=true
+)::StackTrace
+    stack = StackTrace()
+    for ip in trace
+        frames = get(cache, ip) do
+            res = lookup(ip)
+            cache[ip] = res
+            return res
+        end
+        for frame in frames
+            # Skip C frames unless c_funcs is true.
+            if c_funcs || !frame.from_c
+                push!(stack, frame)
+            end
+        end
+    end
+    return stack
+end
+
+# Precompile once for the package cache.
+@assert precompile(start, ())
+@assert precompile(stop, ())
+@assert precompile(fetch, ())
+
+end
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index f297ad12f80a1..b187bcefb928c 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -865,4 +865,6 @@ warning_empty() = @warn """
             running it multiple times), or adjust the delay between samples with `Profile.init()`."""
 
+include("Allocs.jl")
+
 end # module
diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl
new file mode 100644
index 0000000000000..b8d6222d07567
--- /dev/null
+++ b/stdlib/Profile/test/allocs.jl
@@ -0,0 +1,122 @@
+using Test
+using Profile: Allocs
+
+@testset "alloc profiler doesn't segfault" begin
+    res = Allocs.@profile sample_rate=1.0 begin
+        # test the allocations during compilation
+        using Base64
+    end
+    profile = Allocs.fetch()
+
+    @test length(profile.allocs) > 0
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+end
+
+@testset "alloc profiler works when there are multiple tasks on multiple threads" begin
+    NUM_TASKS = 1000
+
+    # This test is only really meaningful if we're running on
+    # multiple threads, but that isn't true on the Windows tests,
+    # causing them to fail. So this assertion is commented out.
+    # @test Threads.nthreads() > 1
+
+    function do_work()
+        ch = Channel{Vector{Float64}}(Inf)
+        @sync for i in 1:NUM_TASKS
+            Threads.@spawn begin
+                # generate garbage
+                put!(ch, zeros(100))
+            end
+        end
+        close(ch)
+    end
+
+    # call once to make sure it's compiled
+    precompile(do_work, ())
+    do_work()
+
+    res = Allocs.@profile sample_rate=1 begin
+        do_work()
+    end
+    profile = Allocs.fetch()
+
+    # expecting at least 2 allocations per task:
+    # 1. the task
+    # 2. the vector
+    @test length(profile.allocs) >= 2*NUM_TASKS
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+
+    @testset for type in (Task, Vector{Float64},)
+        @test length(filter(a->a.type <: type, profile.allocs)) >= NUM_TASKS
+    end
+
+    # TODO: it would be nice to assert that these tasks
+    # were actually scheduled onto multiple threads,
+    # and that we see allocs from all threads in the profile
+end
+
+@testset "alloc profiler start stop fetch clear" begin
+    function do_work()
+        # Compiling allocates a lot
+        for f in (gensym() for _ in 1:10)
+            @eval begin
+                $f() = 10
+                $f()
+            end
+        end
+    end
+
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    Allocs.@profile sample_rate=1 do_work()
+    curr_allocs = length(Allocs.fetch().allocs)
+    @test curr_allocs > 10
+
+    # Do _more_ work, adding into the same profile
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > curr_allocs
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # Clear without fetching
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # And things still work like normal afterwards
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+end
+
+@testset "alloc profiler catches strings" begin
+    Allocs.@profile sample_rate=1 "$(rand())"
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == String]) >= 1
+end
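Beyond the assertions above, the decoded results lend themselves to simple aggregation. A possible post-processing sketch (uses only the `type` and `size` fields defined in Allocs.jl; not part of this diff):

```julia
using Profile

Profile.Allocs.@profile sample_rate=1.0 begin
    [zeros(16) for _ in 1:100]  # any allocating workload
end
prof = Profile.Allocs.fetch()

# Total sampled bytes per type, largest first.
totals = Dict{Any,Int}()
for a in prof.allocs
    totals[a.type] = get(totals, a.type, 0) + a.size
end
foreach(println, sort!(collect(totals), by=last, rev=true))
```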