Skip to content

Commit be50555

Browse files
[CAS] Add a new API in ObjectStore to import a CAS tree
Add a new API to ObjectStore that can import a CAS tree from another CAS. The two ObjectStores don't have to share the same hashing algorithm, since all the objects are rehashed as they are inserted into the new database. To support better testing, the test plugin CAS library now uses SHA1 hashing, which differs from the default BLAKE3 hasher used by the builtin CAS, so the test plugin library can be used to test the interaction between CASes with different schemas. (cherry picked from commit 5450d4f)
1 parent 5246b24 commit be50555

File tree

6 files changed

+163
-30
lines changed

6 files changed

+163
-30
lines changed

clang/test/CAS/print-compile-job-cache-key.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,5 +68,5 @@
6868
// RUN: cat %t/output-plugin.txt | sed \
6969
// RUN: -e "s/^.*miss for '//" \
7070
// RUN: -e "s/' .*$//" > %t/cache-key-plugin
71-
// RUN: clang-cas-test -print-compile-job-cache-key -cas %t/cas @%t/cache-key-plugin \
71+
// RUN: clang-cas-test -print-compile-job-cache-key -cas %t/cas-plugin @%t/cache-key-plugin \
7272
// RUN: -fcas-plugin-path %llvmshlibdir/libCASPluginTest%pluginext | FileCheck %s

llvm/include/llvm/CAS/ObjectStore.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,10 @@ class ObjectStore {
309309
/// Validate the whole node tree.
310310
Error validateTree(ObjectRef Ref);
311311

312+
/// Import an object from another CAS. This will import the full tree from the
313+
/// other CAS and return the reference to the imported root in this CAS.
314+
Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other);
315+
312316
/// Print the ObjectStore internals for debugging purpose.
313317
virtual void print(raw_ostream &) const {}
314318
void dump() const;

llvm/lib/CAS/ObjectStore.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "llvm/Support/FileSystem.h"
1818
#include "llvm/Support/ManagedStatic.h"
1919
#include "llvm/Support/SmallVectorMemoryBuffer.h"
20+
#include <deque>
2021

2122
using namespace llvm;
2223
using namespace llvm::cas;
@@ -217,6 +218,92 @@ Error ObjectStore::validateTree(ObjectRef Root) {
217218
return Error::success();
218219
}
219220

221+
/// Import the object \p Other, together with every object reachable from it,
/// from \p Upstream into this ObjectStore.
///
/// \param Upstream the ObjectStore that owns \p Other.
/// \param Other    reference (valid in \p Upstream) to the root of the tree
///                 to import.
/// \returns the ObjectRef of the imported root in this ObjectStore, or the
///          first error reported while loading from \p Upstream or storing
///          into this ObjectStore. If \p Upstream is this store, \p Other is
///          returned unchanged.
Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream,
                                              ObjectRef Other) {
  // Copy the full CAS tree from upstream with depth-first ordering to ensure
  // all the child nodes are available in downstream CAS before inserting
  // current object. This uses a similar algorithm as
  // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema
  // so it can be used to import from any other ObjectStore regardless of the
  // CAS schema.

  // There is no work to do if importing from self.
  if (this == &Upstream)
    return Other;

  /// Keeps track of the state of visitation for current node and all of its
  /// parents. Upstream Cursor holds information only from upstream CAS.
  struct UpstreamCursor {
    /// Upstream reference of the node being visited.
    ObjectRef Ref;
    /// Loaded upstream handle for \c Ref.
    ObjectHandle Node;
    /// Total number of references the node has.
    size_t RefsCount;
    /// Upstream references of the node that have not been visited yet.
    std::deque<ObjectRef> Refs;
  };
  SmallVector<UpstreamCursor, 16> CursorStack;
  /// PrimaryRefStack holds ObjectRefs of the current CAS, for nodes that were
  /// either just stored in the CAS or already recorded in \c CreatedObjects.
  SmallVector<ObjectRef, 128> PrimaryRefStack;
  /// A map from upstream ObjectRef to current ObjectRef.
  llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects;

  // Push a cursor for an upstream node, snapshotting all of its references.
  auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) {
    unsigned NumRefs = Upstream.getNumRefs(Node);
    std::deque<ObjectRef> Refs;
    for (unsigned I = 0; I < NumRefs; ++I)
      Refs.push_back(Upstream.readRef(Node, I));

    CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)});
  };

  auto UpstreamHandle = Upstream.load(Other);
  if (!UpstreamHandle)
    return UpstreamHandle.takeError();
  enqueueNode(Other, *UpstreamHandle);

  while (!CursorStack.empty()) {
    UpstreamCursor &Cur = CursorStack.back();
    if (Cur.Refs.empty()) {
      // All references of the current node have been visited. Copy the node
      // data into the primary store. The top \c Cur.RefsCount entries of
      // \p PrimaryRefStack are the already-imported references of the current
      // node, in order.
      assert(PrimaryRefStack.size() >= Cur.RefsCount);
      auto Refs = ArrayRef(PrimaryRefStack)
                      .slice(PrimaryRefStack.size() - Cur.RefsCount);
      auto NewNode = store(Refs, Upstream.getData(Cur.Node));
      if (!NewNode)
        return NewNode.takeError();

      // Replace the references of the current node on the stack with the
      // newly stored node, and remember the upstream -> primary mapping.
      PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount);
      PrimaryRefStack.push_back(*NewNode);
      CreatedObjects.try_emplace(Cur.Ref, *NewNode);

      // Pop the cursor only after the last use of \c Cur: \c Cur refers to
      // the element being popped, so it must not be touched afterwards.
      CursorStack.pop_back();
      continue;
    }

    // Check if the node exists already.
    auto CurrentID = Cur.Refs.front();
    Cur.Refs.pop_front();
    auto Ref = CreatedObjects.find(CurrentID);
    if (Ref != CreatedObjects.end()) {
      // If exists already, just need to enqueue the primary node.
      PrimaryRefStack.push_back(Ref->second);
      continue;
    }

    // Load child. Note that enqueueNode may grow CursorStack and invalidate
    // \c Cur, so \c Cur must not be used after this point.
    auto PrimaryID = Upstream.load(CurrentID);
    if (LLVM_UNLIKELY(!PrimaryID))
      return PrimaryID.takeError();

    enqueueNode(CurrentID, *PrimaryID);
  }

  // Only the imported root remains on the stack.
  assert(PrimaryRefStack.size() == 1);
  return PrimaryRefStack.front();
}
306+
220307
std::unique_ptr<MemoryBuffer>
221308
ObjectProxy::getMemoryBuffer(StringRef Name,
222309
bool RequiresNullTerminator) const {

llvm/test/tools/llvm-cas/ingest.test

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,7 @@ CHECK-ERROR: llvm-cas: get-cas-id: No such file or directory
3636
RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/cas.id 2>&1 | FileCheck %s --check-prefix=CHECK-NODE-REFS
3737
CHECK-NODE-REFS: llvmcas://
3838
CHECK-NODE-REFS: llvmcas://
39+
40+
// Test importing the entire tree from the builtin CAS into a plugin CAS.
41+
RUN: llvm-cas --cas %t/new-cas --fcas-plugin-path %llvmshlibdir/libCASPluginTest%pluginext --upstream-cas %t/cas --import @%t/cas.id > %t/plugin.id
42+
RUN: llvm-cas --cas %t/new-cas --fcas-plugin-path %llvmshlibdir/libCASPluginTest%pluginext --ls-tree-recursive @%t/plugin.id | FileCheck %s

llvm/tools/libCASPluginTest/libCASPluginTest.cpp

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "llvm-c/CAS/PluginAPI_functions.h"
14-
#include "llvm/CAS/BuiltinCASContext.h"
1514
#include "llvm/CAS/BuiltinObjectHasher.h"
15+
#include "llvm/CAS/CASID.h"
1616
#include "llvm/CAS/UnifiedOnDiskCache.h"
1717
#include "llvm/Support/CBindingWrapping.h"
1818
#include "llvm/Support/Errc.h"
1919
#include "llvm/Support/Error.h"
2020
#include "llvm/Support/ThreadPool.h"
21+
#include "llvm/Support/SHA1.h"
2122

2223
using namespace llvm;
2324
using namespace llvm::cas;
24-
using namespace llvm::cas::builtin;
2525
using namespace llvm::cas::ondisk;
2626

2727
static char *copyNewMallocString(StringRef Str) {
@@ -125,6 +125,54 @@ bool llcas_cas_options_set_option(llcas_cas_options_t c_opts, const char *name,
125125

126126
namespace {
127127

128+
using HasherT = SHA1;
129+
using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
130+
131+
class PluginCASContext : public CASContext {
132+
void printIDImpl(raw_ostream &OS, const CASID &ID) const final {
133+
PluginCASContext::printID(ID.getHash(), OS);
134+
}
135+
136+
public:
137+
static StringRef getHashName() { return "SHA1"; }
138+
StringRef getHashSchemaIdentifier() const final {
139+
static const std::string ID =
140+
("llvm.cas.builtin.v2[" + getHashName() + "]").str();
141+
return ID;
142+
}
143+
144+
PluginCASContext() = default;
145+
146+
static Expected<HashType> parseID(StringRef Reference) {
147+
if (!Reference.consume_front("llvmcas://"))
148+
return createStringError(
149+
std::make_error_code(std::errc::invalid_argument),
150+
"invalid cas-id '" + Reference + "'");
151+
152+
if (Reference.size() != 2 * sizeof(HashType))
153+
return createStringError(
154+
std::make_error_code(std::errc::invalid_argument),
155+
"wrong size for cas-id hash '" + Reference + "'");
156+
157+
std::string Binary;
158+
if (!tryGetFromHex(Reference, Binary))
159+
return createStringError(
160+
std::make_error_code(std::errc::invalid_argument),
161+
"invalid hash in cas-id '" + Reference + "'");
162+
163+
assert(Binary.size() == sizeof(HashType));
164+
HashType Digest;
165+
llvm::copy(Binary, Digest.data());
166+
return Digest;
167+
}
168+
169+
static void printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) {
170+
SmallString<64> Hash;
171+
toHex(Digest, /*LowerCase=*/true, Hash);
172+
OS << "llvmcas://" << Hash;
173+
}
174+
};
175+
128176
struct CASWrapper {
129177
std::string FirstPrefix;
130178
std::string SecondPrefix;
@@ -308,15 +356,15 @@ llcas_cas_t llcas_cas_create(llcas_cas_options_t c_opts, char **error) {
308356
auto &Opts = *unwrap(c_opts);
309357
Expected<std::unique_ptr<UnifiedOnDiskCache>> DB = UnifiedOnDiskCache::open(
310358
Opts.OnDiskPath, /*SizeLimit=*/std::nullopt,
311-
BuiltinCASContext::getHashName(), sizeof(HashType));
359+
PluginCASContext::getHashName(), sizeof(HashType));
312360
if (!DB)
313361
return reportError<llcas_cas_t>(DB.takeError(), error);
314362

315363
std::unique_ptr<UnifiedOnDiskCache> UpstreamDB;
316364
if (!Opts.UpstreamPath.empty()) {
317365
if (Error E = UnifiedOnDiskCache::open(
318366
Opts.UpstreamPath, /*SizeLimit=*/std::nullopt,
319-
BuiltinCASContext::getHashName(), sizeof(HashType))
367+
PluginCASContext::getHashName(), sizeof(HashType))
320368
.moveInto(UpstreamDB))
321369
return reportError<llcas_cas_t>(std::move(E), error);
322370
}
@@ -380,7 +428,7 @@ unsigned llcas_digest_parse(llcas_cas_t c_cas, const char *printed_digest,
380428
assert(Consumed);
381429
(void)Consumed;
382430

383-
Expected<HashType> Digest = BuiltinCASContext::parseID(PrintedDigest);
431+
Expected<HashType> Digest = PluginCASContext::parseID(PrintedDigest);
384432
if (!Digest)
385433
return reportError(Digest.takeError(), error, 0);
386434
std::uninitialized_copy(Digest->begin(), Digest->end(), bytes);
@@ -394,7 +442,7 @@ bool llcas_digest_print(llcas_cas_t c_cas, llcas_digest_t c_digest,
394442
raw_svector_ostream OS(PrintDigest);
395443
// Include these for testing purposes.
396444
OS << Wrapper.FirstPrefix << Wrapper.SecondPrefix;
397-
BuiltinCASContext::printID(ArrayRef(c_digest.data, c_digest.size), OS);
445+
PluginCASContext::printID(ArrayRef(c_digest.data, c_digest.size), OS);
398446
*printed_id = copyNewMallocString(PrintDigest);
399447
return false;
400448
}

llvm/tools/llvm-cas/llvm-cas.cpp

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,8 @@ int main(int Argc, char **Argv) {
241241
if (!UpstreamCAS)
242242
ExitOnErr(createStringError(inconvertibleErrorCode(),
243243
"missing '-upstream-cas'"));
244-
return import(*CAS, *UpstreamCAS, Inputs);
244+
245+
return import(*UpstreamCAS, *CAS, Inputs);
245246
}
246247

247248
if (Command == PutCacheKey || Command == GetCacheResult) {
@@ -641,32 +642,21 @@ int getCASIDForFile(ObjectStore &CAS, const CASID &ID,
641642
return 0;
642643
}
643644

644-
static ObjectRef importNode(ObjectStore &CAS, ObjectStore &UpstreamCAS,
645-
const CASID &ID) {
646-
ExitOnError ExitOnErr("llvm-cas: import: ");
647-
648-
std::optional<ObjectRef> PrimaryRef = CAS.getReference(ID);
649-
if (PrimaryRef)
650-
return *PrimaryRef; // object is present.
651-
652-
ObjectProxy UpstreamObj = ExitOnErr(UpstreamCAS.getProxy(ID));
653-
SmallVector<ObjectRef> Refs;
654-
ExitOnErr(UpstreamObj.forEachReference([&](ObjectRef UpstreamRef) -> Error {
655-
ObjectRef Ref =
656-
importNode(CAS, UpstreamCAS, UpstreamCAS.getID(UpstreamRef));
657-
Refs.push_back(Ref);
658-
return Error::success();
659-
}));
660-
return ExitOnErr(CAS.storeFromString(Refs, UpstreamObj.getData()));
661-
}
662-
663-
static int import(ObjectStore &CAS, ObjectStore &UpstreamCAS,
645+
static int import(ObjectStore &FromCAS, ObjectStore &ToCAS,
664646
ArrayRef<std::string> Objects) {
665647
ExitOnError ExitOnErr("llvm-cas: import: ");
666648

667649
for (StringRef Object : Objects) {
668-
CASID ID = ExitOnErr(CAS.parseID(Object));
669-
importNode(CAS, UpstreamCAS, ID);
650+
CASID ID = ExitOnErr(FromCAS.parseID(Object));
651+
auto Ref = FromCAS.getReference(ID);
652+
if (!Ref) {
653+
ExitOnErr(createStringError(inconvertibleErrorCode(),
654+
"input not found: " + ID.toString()));
655+
return 1;
656+
}
657+
658+
auto Imported = ExitOnErr(ToCAS.importObject(FromCAS, *Ref));
659+
llvm::outs() << ToCAS.getID(Imported).toString() << "\n";
670660
}
671661
return 0;
672662
}

0 commit comments

Comments
 (0)