Skip to content

Commit e3ae520

Browse files
committed
Fixing node.
1 parent 7c952b8 commit e3ae520

File tree

9 files changed: +104 −6910 lines changed

bindings/node/index.d.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@ export function ctcDecoder(
1111
cleanup?: boolean | undefined | null,
1212
): Decoder
1313
export function fuseDecoder(): Decoder
14-
export function metaspaceDecoder(replacement?: string = '▁', addPrefixSpace?: bool = true): Decoder
14+
export function metaspaceDecoder(
15+
replacement?: string = '▁',
16+
prependScheme?: prepend_scheme = 'always',
17+
split?: split = true,
18+
): Decoder
1519
export function replaceDecoder(pattern: string, content: string): Decoder
1620
export function sequenceDecoder(decoders: Array<Decoder>): Decoder
1721
export function stripDecoder(content: string, left: number, right: number): Decoder
@@ -89,7 +93,11 @@ export function byteLevelAlphabet(): Array<string>
8993
export function whitespacePreTokenizer(): PreTokenizer
9094
export function whitespaceSplitPreTokenizer(): PreTokenizer
9195
export function bertPreTokenizer(): PreTokenizer
92-
export function metaspacePreTokenizer(replacement?: string = '▁', addPrefixSpace?: bool = true): PreTokenizer
96+
export function metaspacePreTokenizer(
97+
replacement?: string = '▁',
98+
prependScheme?: prepend_scheme = 'always',
99+
split?: split = true,
100+
): PreTokenizer
93101
export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
94102
export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
95103
export function sequencePreTokenizer(preTokenizers: Array<PreTokenizer>): PreTokenizer

bindings/node/index.js

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,43 @@ switch (platform) {
219219
loadError = e
220220
}
221221
break
222+
case 'riscv64':
223+
if (isMusl()) {
224+
localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-musl.node'))
225+
try {
226+
if (localFileExisted) {
227+
nativeBinding = require('./tokenizers.linux-riscv64-musl.node')
228+
} else {
229+
nativeBinding = require('tokenizers-linux-riscv64-musl')
230+
}
231+
} catch (e) {
232+
loadError = e
233+
}
234+
} else {
235+
localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-gnu.node'))
236+
try {
237+
if (localFileExisted) {
238+
nativeBinding = require('./tokenizers.linux-riscv64-gnu.node')
239+
} else {
240+
nativeBinding = require('tokenizers-linux-riscv64-gnu')
241+
}
242+
} catch (e) {
243+
loadError = e
244+
}
245+
}
246+
break
247+
case 's390x':
248+
localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-s390x-gnu.node'))
249+
try {
250+
if (localFileExisted) {
251+
nativeBinding = require('./tokenizers.linux-s390x-gnu.node')
252+
} else {
253+
nativeBinding = require('tokenizers-linux-s390x-gnu')
254+
}
255+
} catch (e) {
256+
loadError = e
257+
}
258+
break
222259
default:
223260
throw new Error(`Unsupported architecture on Linux: ${arch}`)
224261
}

bindings/node/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "tokenizers",
3-
"version": "0.14.0-dev0",
3+
"version": "0.15.3-dev0",
44
"repository": {
55
"type": "git",
66
"url": "git+https://github.com/huggingface/tokenizers.git"

bindings/node/src/decoders.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,19 +90,29 @@ pub fn fuse_decoder() -> Decoder {
9090
#[napi]
9191
pub fn metaspace_decoder(
9292
#[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
93-
#[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
93+
#[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
94+
#[napi(ts_arg_type = "split = true")] split: Option<bool>,
9495
) -> Result<Decoder> {
95-
let add_prefix_space = add_prefix_space.unwrap_or(true);
96+
use tk::pre_tokenizers::metaspace::PrependScheme;
97+
let split = split.unwrap_or(true);
9698
let replacement = replacement.unwrap_or("▁".to_string());
9799
if replacement.chars().count() != 1 {
98100
return Err(Error::from_reason(
99101
"replacement is supposed to be a single char",
100102
));
101103
}
102104
let replacement = replacement.chars().next().unwrap();
105+
let prepend_scheme: PrependScheme = match prepend_scheme.unwrap_or(String::from("always")).as_str(){
106+
"always" => PrependScheme::Always,
107+
"first" => PrependScheme::First,
108+
"never" => PrependScheme::Never,
109+
_ => {return Err(Error::from_reason(
110+
"prepend_scheme is supposed to be either 'always', 'first' or 'never'",
111+
));}
112+
};
103113
Ok(Decoder {
104114
decoder: Some(Arc::new(RwLock::new(
105-
tk::decoders::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
115+
tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
106116
))),
107117
})
108118
}

bindings/node/src/pre_tokenizers.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,20 +155,30 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
155155
#[napi]
156156
pub fn metaspace_pre_tokenizer(
157157
#[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
158-
#[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
158+
#[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
159+
#[napi(ts_arg_type = "split = true")] split: Option<bool>,
159160
) -> Result<PreTokenizer> {
160-
let add_prefix_space = add_prefix_space.unwrap_or(true);
161+
use tk::pre_tokenizers::metaspace::PrependScheme;
162+
let split = split.unwrap_or(true);
161163
let replacement = replacement.unwrap_or("▁".to_string());
162164
if replacement.chars().count() != 1 {
163165
return Err(Error::from_reason(
164166
"replacement is supposed to be a single char",
165167
));
166168
}
167169
let replacement = replacement.chars().next().unwrap();
170+
let prepend_scheme: PrependScheme = match prepend_scheme.unwrap_or(String::from("always")).as_str(){
171+
"always" => PrependScheme::Always,
172+
"first" => PrependScheme::First,
173+
"never" => PrependScheme::Never,
174+
_ => {return Err(Error::from_reason(
175+
"prepend_scheme is supposed to be either 'always', 'first' or 'never'",
176+
));}
177+
};
168178

169179
Ok(PreTokenizer {
170180
pretok: Some(Arc::new(RwLock::new(
171-
tk::pre_tokenizers::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
181+
tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
172182
))),
173183
})
174184
}

0 commit comments