Skip to content

Commit fffe16e

Browse files
ngxson and coyotte508
authored
Hub: Move hash-wasm into vendor (emscripten) (#682)
Resolves #221. In this PR, I moved the source code of `hash-wasm` into the vendor directory (C source code) and added a compile script using Emscripten. As a bonus, SIMD is also enabled, which should allow better performance. I also added `sha256-wrapper.ts` as a thin wrapper for the compiled wasm module. - `createSHA256` creates the wasm module and works just like before. - `createSHA256WorkerCode` uses the `Function.toString()` trick to copy the module code into the worker. This trick is already used in one of my projects, [wllama](https://github.com/ngxson/wllama), so it should work on Firefox/Chrome/Safari (but feel free to test it further). --------- Co-authored-by: Eliott C <[email protected]>
1 parent 6396f72 commit fffe16e

File tree

11 files changed

+1282
-39
lines changed

11 files changed

+1282
-39
lines changed

packages/hub/.eslintignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
dist
2+
sha256.js

packages/hub/.prettierignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
pnpm-lock.yaml
22
# In order to avoid code samples to have tabs, they don't display well on npm
33
README.md
4-
dist
4+
dist
5+
sha256.js

packages/hub/package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@
5959
"@types/node": "^20.11.28"
6060
},
6161
"dependencies": {
62-
"@huggingface/tasks": "workspace:^",
63-
"hash-wasm": "^4.9.0"
62+
"@huggingface/tasks": "workspace:^"
6463
}
6564
}

packages/hub/pnpm-lock.yaml

Lines changed: 0 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/hub/src/utils/sha256.spec.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { describe, it, expect } from "vitest";
2+
import { sha256 } from "./sha256";
3+
4+
// Test fixtures: input strings paired with the SHA-256 hex digest the tests expect for them.
const smallContent = "hello world";
5+
const smallContentSHA256 = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";
6+
// NOTE(review): this input starts with the letter "O", whereas biggerContent below starts with the digit "0".
// Presumably bigContentSHA256 was computed for this exact string; confirm before "fixing" either value.
const bigContent = "O123456789".repeat(100_000);
7+
const bigContentSHA256 = "a3bbce7ee1df7233d85b5f4d60faa3755f93f537804f8b540c72b0739239ddf8";
8+
const biggerContent = "0123456789".repeat(1_000_000);
9+
const biggerContentSHA256 = "d52fcc26b48dbd4d79b125eb0a29b803ade07613c67ac7c6f2751aefef008486";
10+
11+
describe("sha256", () => {
	/**
	 * Runs `sha256` over a Blob built from `content`, steps the returned iterator
	 * until it reports `done`, and returns the iterator's final value.
	 */
	async function calcSHA256(content: string, useWebWorker: boolean) {
		const iterator = sha256(new Blob([content]), { useWebWorker });
		let step: IteratorResult<number, string> = await iterator.next();
		while (!step.done) {
			step = await iterator.next();
		}
		return step.value;
	}

	// One entry per test, registered in this order: first without a web worker, then with one.
	const cases = [
		{
			title: "Calculate hash of a small file",
			content: smallContent,
			expected: smallContentSHA256,
			useWebWorker: false,
		},
		{
			title: "Calculate hash of a big file",
			content: bigContent,
			expected: bigContentSHA256,
			useWebWorker: false,
		},
		{
			title: "Calculate hash of a bigger file",
			content: biggerContent,
			expected: biggerContentSHA256,
			useWebWorker: false,
		},
		{
			title: "Calculate hash of a small file (+ web worker)",
			content: smallContent,
			expected: smallContentSHA256,
			useWebWorker: true,
		},
		{
			title: "Calculate hash of a big file (+ web worker)",
			content: bigContent,
			expected: bigContentSHA256,
			useWebWorker: true,
		},
		{
			title: "Calculate hash of a bigger file (+ web worker)",
			content: biggerContent,
			expected: biggerContentSHA256,
			useWebWorker: true,
		},
	];

	for (const { title, content, expected, useWebWorker } of cases) {
		it(title, async () => {
			const sha = await calcSHA256(content, useWebWorker);
			expect(sha).toBe(expected);
		});
	}
});

packages/hub/src/utils/sha256.ts

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,10 @@ import { eventToGenerator } from "./eventToGenerator";
22
import { hexFromBytes } from "./hexFromBytes";
33
import { isFrontend } from "./isFrontend";
44

5-
const webWorkerCode = `
6-
// Would prefer no CDN, but need a clever way to not burden the main file of the bundle
7-
importScripts("https://cdn.jsdelivr.net/npm/hash-wasm@4/dist/sha256.umd.min.js");
8-
9-
const createSHA256 = hashwasm.createSHA256;
10-
11-
self.addEventListener('message', async (event) => {
12-
const { file } = event.data;
13-
const sha256 = await createSHA256();
14-
sha256.init();
15-
const reader = file.stream().getReader();
16-
const total = file.size;
17-
let bytesDone = 0;
18-
while (true) {
19-
const { done, value } = await reader.read();
20-
if (done) {
21-
break;
22-
}
23-
sha256.update(value);
24-
bytesDone += value.length;
25-
postMessage({ progress: bytesDone / total });
26-
}
27-
postMessage({ sha256: sha256.digest('hex') });
28-
});
29-
`;
5+
/**
 * Object URL of the worker script, created on first use and then shared.
 * A single Blob URL can back any number of `new Worker(url)` calls, so there is no
 * need to re-serialize the wasm wrapper (and leak a new, never-revoked Blob URL)
 * for every worker that gets spawned.
 */
let webWorkerCodeUrl: string | undefined;

/**
 * Returns an object URL pointing at the SHA-256 worker script, suitable for `new Worker(url)`.
 *
 * The wrapper module is imported dynamically so that it is only loaded when a worker is needed.
 * The URL is cached after the first successful call; if the import fails nothing is cached,
 * so a later call retries.
 */
async function getWebWorkerCode(): Promise<string> {
	if (webWorkerCodeUrl === undefined) {
		const sha256Module = await import("../vendor/hash-wasm/sha256-wrapper");
		// Re-check after the await: a concurrent caller may have created the URL in the meantime.
		if (webWorkerCodeUrl === undefined) {
			webWorkerCodeUrl = URL.createObjectURL(new Blob([sha256Module.createSHA256WorkerCode()]));
		}
	}
	return webWorkerCodeUrl;
}
309

3110
const pendingWorkers: Worker[] = [];
3211
const runningWorkers: Set<Worker> = new Set();
@@ -45,7 +24,7 @@ async function getWorker(poolSize?: number): Promise<Worker> {
4524
}
4625
}
4726
if (!poolSize) {
48-
const worker = new Worker(URL.createObjectURL(new Blob([webWorkerCode])));
27+
const worker = new Worker(await getWebWorkerCode());
4928
runningWorkers.add(worker);
5029
return worker;
5130
}
@@ -58,7 +37,7 @@ async function getWorker(poolSize?: number): Promise<Worker> {
5837
await waitPromise;
5938
}
6039

61-
const worker = new Worker(URL.createObjectURL(new Blob([webWorkerCode])));
40+
const worker = new Worker(await getWebWorkerCode());
6241
runningWorkers.add(worker);
6342
return worker;
6443
}
@@ -147,7 +126,7 @@ export async function* sha256(
147126
}
148127
}
149128
if (!wasmModule) {
150-
wasmModule = await import("hash-wasm");
129+
wasmModule = await import("../vendor/hash-wasm/sha256-wrapper");
151130
}
152131

153132
const sha256 = await wasmModule.createSHA256();
@@ -184,4 +163,4 @@ export async function* sha256(
184163
// eslint-disable-next-line @typescript-eslint/consistent-type-imports
185164
let cryptoModule: typeof import("./sha256-node");
186165
// eslint-disable-next-line @typescript-eslint/consistent-type-imports
187-
let wasmModule: typeof import("hash-wasm");
166+
let wasmModule: typeof import("../vendor/hash-wasm/sha256-wrapper");
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# Rebuilds ./sha256.js from ./sha256.c with Emscripten, inside a temporary Docker container.

# Stop at the first failing step so a broken compile never copies back a stale/missing file.
set -euo pipefail

CONTAINER_NAME="hash-wasm-builder"

CURRENT_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so that a checkout path containing spaces still works.
cd "$CURRENT_PATH"

# Kill and remove the builder container; a missing container is not an error.
cleanup() {
	docker rm -f "$CONTAINER_NAME" &> /dev/null || true
}

# Clean up leftovers from a previous run, and clean up again on exit (success or failure)
cleanup
trap cleanup EXIT

# Start container
docker run -it -d --name "$CONTAINER_NAME" emscripten/emsdk:3.1.55 bash

# Copy & compile
docker exec "$CONTAINER_NAME" bash -c "mkdir /source"
docker cp ./sha256.c "$CONTAINER_NAME":/source
docker exec "$CONTAINER_NAME" bash -c "\
  cd /source && \
  emcc sha256.c -o sha256.js \
    -msimd128 \
    -sSINGLE_FILE \
    -sMODULARIZE=1 \
    -sEXPORT_ES6=1 \
    -sENVIRONMENT=web,worker \
    -sEXPORTED_FUNCTIONS=_Hash_Init,_Hash_Update,_Hash_Final,_GetBufferPtr \
    -sFILESYSTEM=0 \
    -fno-rtti -fno-exceptions -O1 \
"

# Patch "_scriptDir" variable: rename its declaration, then replace every remaining use with `false`
docker exec "$CONTAINER_NAME" bash -c "\
  cd /source && \
  sed -i 's|var _scriptDir|var _unused|g' ./sha256.js && \
  sed -i 's|_scriptDir|false|g' ./sha256.js \
"

# Copy back compiled file
docker cp "$CONTAINER_NAME":/source/sha256.js .
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import WasmModule from "./sha256";
2+
3+
/**
 * Instantiates the compiled SHA-256 wasm module and returns a small hasher object.
 *
 * Data is exchanged with wasm through a view over `HEAPU8` that starts at the pointer
 * returned by `_GetBufferPtr()`: input chunks are copied into it before `_Hash_Update`,
 * and the digest is read back from its first 32 bytes after `_Hash_Final`.
 *
 * NOTE: `createSHA256WorkerCode` embeds this function through `createSHA256.toString()`,
 * so the body must stay self-contained (no helpers from module scope other than the
 * `WasmModule` fallback used outside a worker).
 *
 * @param isInsideWorker when true, the module factory is taken from `self["SHA256WasmModule"]`
 *   instead of the imported `WasmModule`.
 * @returns an object exposing `init`, `update` and `digest("hex")`.
 */
export async function createSHA256(isInsideWorker = false): Promise<{
	init(): void;
	update(data: Uint8Array): void;
	digest(method: "hex"): string;
}> {
	// Largest chunk copied into the wasm buffer per `_Hash_Update` call.
	// NOTE(review): presumably matches the buffer size declared in sha256.c; confirm.
	const BUFFER_MAX_SIZE = 8 * 1024 * 1024;

	let wasm: Awaited<ReturnType<typeof WasmModule>>;
	if (isInsideWorker) {
		// @ts-expect-error WasmModule will be populated inside self object
		wasm = await self["SHA256WasmModule"]();
	} else {
		wasm = await WasmModule();
	}

	const heap = wasm.HEAPU8.subarray(wasm._GetBufferPtr());

	return {
		init() {
			wasm._Hash_Init(256);
		},
		update(data: Uint8Array) {
			let offset = 0;
			while (offset < data.byteLength) {
				// `subarray` clamps the end index, so the last chunk is simply shorter.
				const chunk = data.subarray(offset, offset + BUFFER_MAX_SIZE);
				heap.set(chunk);
				wasm._Hash_Update(chunk.length);
				offset += chunk.length;
			}
		},
		digest(method: "hex") {
			if (method !== "hex") {
				throw new Error("Only digest hex is supported");
			}
			wasm._Hash_Final();
			// Copy the 32 digest bytes out of the wasm buffer and hex-encode them.
			const digestBytes = heap.slice(0, 32);
			let hex = "";
			digestBytes.forEach((byte) => {
				hex += byte.toString(16).padStart(2, "0");
			});
			return hex;
		},
	};
}
38+
39+
export function createSHA256WorkerCode(): string {
40+
return `
41+
self.addEventListener('message', async (event) => {
42+
const { file } = event.data;
43+
const sha256 = await self.createSHA256(true);
44+
sha256.init();
45+
const reader = file.stream().getReader();
46+
const total = file.size;
47+
let bytesDone = 0;
48+
while (true) {
49+
const { done, value } = await reader.read();
50+
if (done) {
51+
break;
52+
}
53+
sha256.update(value);
54+
bytesDone += value.length;
55+
postMessage({ progress: bytesDone / total });
56+
}
57+
postMessage({ sha256: sha256.digest('hex') });
58+
});
59+
self.SHA256WasmModule = ${WasmModule.toString()};
60+
self.createSHA256 = ${createSHA256.toString()};
61+
`;
62+
}

0 commit comments

Comments
 (0)