Skip to content

Commit d054074

Browse files
jsl-modelsahmedlone127maziyarpanahi
authored
2023-06-21-bert_embeddings_distil_clinical_en (#13861)
* Add model 2023-06-21-bert_embeddings_distil_clinical_en * Add model 2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en * Add model 2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en * Add model 2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it * Add model 2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en * Add model 2023-06-21-bert_embeddings_legalbert_adept_en * Add model 2023-06-21-bert_embeddings_base_uncased_issues_128_en * Add model 2023-06-21-bert_embeddings_pretrain_ko * Add model 2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en * Add model 2023-06-21-legalectra_small_es * Add model 2023-06-21-biobert_pubmed_base_cased_v1.2_en * Add model 2023-06-21-bert_embeddings_jobbert_base_cased_en * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de * Add model 2023-06-21-legalectra_base_es * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de * Add model 2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en * Add model 2023-06-21-bert_embeddings_InLegalBERT_en * Add model 2023-06-21-bert_embeddings_InCaseLawBERT_en * Add model 2023-06-21-bert_base_uncased_contracts_en * Add model 2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_generator_en * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it * Add model 2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de * Add model 
2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de * Add model 2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko * Add model 2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_small_generator_en * Add model 2023-06-21-electra_embeddings_electra_large_generator_en * Add model 2023-06-21-electra_embeddings_electricidad_base_generator_es * Add model 2023-06-21-electra_embeddings_gelectra_large_generator_de * Add model 2023-06-21-electra_embeddings_koelectra_base_generator_ko * Add model 2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de * Add model 2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl * Add model 2023-06-21-electra_embeddings_gelectra_base_generator_de * Add model 2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl * Add model 2023-06-21-bert_sentence_embeddings_financial_de * Add model 2023-06-21-electra_embeddings_electra_small_japanese_generator_ja * Add model 2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl * Add model 2023-06-21-electra_embeddings_koelectra_small_generator_ko * Add model 2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko * Add model 2023-06-21-bert_embeddings_sec_bert_base_en * Add model 2023-06-21-electra_embeddings_kr_electra_generator_ko * Add 
model 2023-06-21-bert_embeddings_sec_bert_sh_en * Add model 2023-06-21-bert_embeddings_german_financial_statements_bert_de * Add model 2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl * Add model 2023-06-21-bert_embeddings_javanese_bert_small_jv * Add model 2023-06-21-bert_embeddings_finest_bert_en * Add model 2023-06-21-bert_embeddings_indic_transformers_te_bert_te * Add model 2023-06-21-bert_embeddings_gbert_base_de * Add model 2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi * Add model 2023-06-21-bert_embeddings_hateBERT_en * Add model 2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en * Add model 2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en * Add model 2023-06-21-bert_embeddings_indic_transformers_te_bert_te * Add model 2023-06-21-bert_embeddings_hseBert_it_cased_it * Add model 2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en * Add model 2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es * Add model 2023-06-21-bert_embeddings_dziribert_ar * Add model 2023-06-21-bert_embeddings_deberta_base_uncased_en * Add model 2023-06-21-bert_embeddings_dbert_ko * Add model 2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv * Add model 2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es * Add model 2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es * Add model 2023-06-21-bert_embeddings_crosloengual_bert_en * Add model 2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en * Add model 2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es * Add model 2023-06-21-bert_embeddings_legal_bert_base_uncased_en * Add model 2023-06-21-biobert_embeddings_all_pt * Add model 2023-06-21-bert_embeddings_wineberto_italian_cased_it * Add model 2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en * Add model 2023-06-21-biobert_embeddings_clinical_pt * Add model 2023-06-21-bert_embeddings_telugu_bertu_te * Add model 
2023-06-21-bert_embeddings_wobert_chinese_plus_zh * Add model 2023-06-21-bert_embeddings_wineberto_italian_cased_it * Add model 2023-06-21-bert_embeddings_sikuroberta_zh * Add model 2023-06-21-biobert_embeddings_biomedical_pt * Add model 2023-06-21-bert_embeddings_sikubert_zh * Add model 2023-06-21-bert_embeddings_psych_search_en * Add model 2023-06-21-bert_embeddings_marathi_bert_mr * Add model 2023-06-21-bert_embeddings_netbert_en * Add model 2023-06-21-bert_embeddings_mbert_ar_c19_ar * Add model 2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar * Add model 2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en * Add model 2023-06-21-bert_embeddings_MARBERTv2_ar * Add model 2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar * Add model 2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar * Add model 2023-06-21-bert_embeddings_bert_base_german_uncased_de * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar * Add model 2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de * Add model 2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it * Add model 2023-06-21-bert_embeddings_bert_base_arabertv2_ar * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar * Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar * Add model 2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it * Add model 2023-06-21-bert_embeddings_bert_base_gl_cased_pt * Add model 2023-06-21-bert_embeddings_MARBERT_ar * Add model 2023-06-21-bert_embeddings_AraBertMo_base_V1_ar * Add model 2023-06-21-bert_embeddings_bert_base_arabic_ar * Add model 2023-06-21-bert_embeddings_DarijaBERT_ar * Add model 
2023-06-21-bert_embeddings_Ara_DialectBERT_ar * Add model 2023-06-21-bert_embeddings_German_MedBERT_de * Add model 2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar * Add model 2023-06-21-bert_embeddings_FinancialBERT_en * Add model 2023-06-21-bert_embeddings_ARBERT_ar * Add model 2023-06-21-bert_embeddings_COVID_SciBERT_en * Add model 2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es * Add model 2023-06-21-bert_embeddings_agriculture_bert_uncased_en * Add model 2023-06-21-bert_embeddings_bangla_bert_bn * Add model 2023-06-21-bert_embeddings_bert_kor_base_ko * Add model 2023-06-21-bert_embeddings_bert_base_arabertv02_ar * Add model 2023-06-21-bert_embeddings_arabert_c19_ar * Add model 2023-06-21-bert_embeddings_bert_base_5lang_cased_es * Add model 2023-06-21-bert_embeddings_bert_base_arabertv01_ar * Add model 2023-06-21-bert_embeddings_bangla_bert_base_bn * Add model 2023-06-21-bert_embeddings_bert_medium_arabic_ar * Add model 2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en * Add model 2023-06-21-bert_embeddings_bert_mini_arabic_ar * Add model 2023-06-21-bert_embeddings_bert_base_arabert_ar * Add model 2023-06-21-bert_embeddings_beto_gn_base_cased_es * Add model 2023-06-21-bert_embeddings_chemical_bert_uncased_en * Add model 2023-06-21-bert_embeddings_bert_base_ko * Add model 2023-06-21-bert_embeddings_chefberto_italian_cased_it * Add model 2023-06-21-bert_embeddings_childes_bert_en * Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt * Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt * Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt * Add model 2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar * Add model 2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en * Add model 2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en * Add model 2023-06-21-bert_embeddings_bert_base_qarib_ar * 
Add model 2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en * Add model 2023-06-21-ms_bluebert_base_uncased_en * Add model 2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar * fixing wrong spark version and removing tensorflow --------- Co-authored-by: ahmedlone127 <[email protected]> Co-authored-by: MaziyarPanahi <[email protected]>
1 parent 02a9afb commit d054074

File tree

142 files changed

+20910
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

142 files changed

+20910
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
---
2+
layout: model
3+
title: English Legal Contracts BertEmbeddings model (Base, Uncased)
4+
author: John Snow Labs
5+
name: bert_base_uncased_contracts
6+
date: 2023-06-21
7+
tags: [open_source, bert, embeddings, finance, contracts, en, onnx]
8+
task: Embeddings
9+
language: en
10+
edition: Spark NLP 5.0.0
11+
spark_version: 3.0
12+
supported: true
13+
engine: onnx
14+
annotator: BertEmbeddings
15+
article_header:
16+
type: cover
17+
use_language_switcher: "Python-Scala-Java"
18+
---
19+
20+
## Description
21+
22+
Pretrained Word Embeddings model, trained on legal contracts, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-contracts` is an English model originally trained by `nlpaueb`.
23+
24+
## Predicted Entities
25+
26+
27+
28+
{:.btn-box}
29+
<button class="button button-orange" disabled>Live Demo</button>
30+
<button class="button button-orange" disabled>Open in Colab</button>
31+
[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
32+
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
33+
34+
## How to use
35+
36+
<div class="tabs-box" markdown="1">
37+
{% include programmingLanguageSelectScalaPythonNLU.html %}
38+
```python
39+
documentAssembler = DocumentAssembler() \
40+
.setInputCol("text") \
41+
.setOutputCol("document")
42+
43+
tokenizer = Tokenizer() \
44+
.setInputCols("document") \
45+
.setOutputCol("token")
46+
47+
embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \
48+
.setInputCols(["document", "token"]) \
49+
.setOutputCol("embeddings")
50+
51+
pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
52+
53+
data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text")
54+
55+
result = pipeline.fit(data).transform(data)
56+
```
57+
```scala
58+
val documentAssembler = new DocumentAssembler()
59+
.setInputCol("text")
60+
.setOutputCol("document")
61+
62+
val tokenizer = new Tokenizer()
63+
.setInputCols(Array("document"))
64+
.setOutputCol("token")
65+
66+
val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en")
67+
.setInputCols(Array("document", "token"))
68+
.setOutputCol("embeddings")
69+
70+
val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
71+
72+
val data = Seq("I love Spark NLP.").toDF("text")
73+
74+
val result = pipeline.fit(data).transform(data)
75+
```
76+
77+
78+
{:.nlu-block}
79+
```python
80+
import nlu
81+
nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""")
82+
```
83+
84+
</div>
85+
86+
{:.model-param}
87+
88+
<div class="tabs-box" markdown="1">
89+
{% include programmingLanguageSelectScalaPythonNLU.html %}
90+
```python
91+
documentAssembler = DocumentAssembler() \
92+
.setInputCol("text") \
93+
.setOutputCol("document")
94+
95+
tokenizer = Tokenizer() \
96+
.setInputCols("document") \
97+
.setOutputCol("token")
98+
99+
embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \
100+
.setInputCols(["document", "token"]) \
101+
.setOutputCol("embeddings")
102+
103+
pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
104+
105+
data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text")
106+
107+
result = pipeline.fit(data).transform(data)
108+
```
109+
```scala
110+
val documentAssembler = new DocumentAssembler()
111+
.setInputCol("text")
112+
.setOutputCol("document")
113+
114+
val tokenizer = new Tokenizer()
115+
.setInputCols(Array("document"))
116+
.setOutputCol("token")
117+
118+
val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en")
119+
.setInputCols(Array("document", "token"))
120+
.setOutputCol("embeddings")
121+
122+
val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
123+
124+
val data = Seq("I love Spark NLP.").toDF("text")
125+
126+
val result = pipeline.fit(data).transform(data)
127+
```
128+
129+
{:.nlu-block}
130+
```python
131+
import nlu
132+
nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""")
133+
```
134+
</div>
135+
136+
{:.model-param}
137+
## Model Information
138+
139+
{:.table-model}
140+
|---|---|
141+
|Model Name:|bert_base_uncased_contracts|
142+
|Compatibility:|Spark NLP 5.0.0+|
143+
|License:|Open Source|
144+
|Edition:|Official|
145+
|Input Labels:|[sentence, token]|
146+
|Output Labels:|[bert]|
147+
|Language:|en|
148+
|Size:|407.1 MB|
149+
|Case sensitive:|true|
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
---
2+
layout: model
3+
title: Arabic Bert Embeddings (ARBERT model)
4+
author: John Snow Labs
5+
name: bert_embeddings_ARBERT
6+
date: 2023-06-21
7+
tags: [bert, embeddings, ar, open_source, onnx]
8+
task: Embeddings
9+
language: ar
10+
edition: Spark NLP 5.0.0
11+
spark_version: 3.0
12+
supported: true
13+
engine: onnx
14+
annotator: BertEmbeddings
15+
article_header:
16+
type: cover
17+
use_language_switcher: "Python-Scala-Java"
18+
---
19+
20+
## Description
21+
22+
Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `ARBERT` is an Arabic model originally trained by `UBC-NLP`.
23+
24+
## Predicted Entities
25+
26+
27+
28+
{:.btn-box}
29+
<button class="button button-orange" disabled>Live Demo</button>
30+
<button class="button button-orange" disabled>Open in Colab</button>
31+
[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
32+
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
33+
34+
## How to use
35+
36+
<div class="tabs-box" markdown="1">
37+
{% include programmingLanguageSelectScalaPythonNLU.html %}
38+
```python
39+
documentAssembler = DocumentAssembler() \
40+
.setInputCol("text") \
41+
.setOutputCol("document")
42+
43+
tokenizer = Tokenizer() \
44+
.setInputCols("document") \
45+
.setOutputCol("token")
46+
47+
embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \
48+
.setInputCols(["document", "token"]) \
49+
.setOutputCol("embeddings")
50+
51+
pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
52+
53+
data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text")
54+
55+
result = pipeline.fit(data).transform(data)
56+
```
57+
```scala
58+
val documentAssembler = new DocumentAssembler()
59+
.setInputCol("text")
60+
.setOutputCol("document")
61+
62+
val tokenizer = new Tokenizer()
63+
.setInputCols(Array("document"))
64+
.setOutputCol("token")
65+
66+
val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar")
67+
.setInputCols(Array("document", "token"))
68+
.setOutputCol("embeddings")
69+
70+
val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
71+
72+
val data = Seq("أنا أحب شرارة NLP").toDF("text")
73+
74+
val result = pipeline.fit(data).transform(data)
75+
```
76+
77+
78+
{:.nlu-block}
79+
```python
80+
import nlu
81+
nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""")
82+
```
83+
84+
</div>
85+
86+
{:.model-param}
87+
88+
<div class="tabs-box" markdown="1">
89+
{% include programmingLanguageSelectScalaPythonNLU.html %}
90+
```python
91+
documentAssembler = DocumentAssembler() \
92+
.setInputCol("text") \
93+
.setOutputCol("document")
94+
95+
tokenizer = Tokenizer() \
96+
.setInputCols("document") \
97+
.setOutputCol("token")
98+
99+
embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \
100+
.setInputCols(["document", "token"]) \
101+
.setOutputCol("embeddings")
102+
103+
pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
104+
105+
data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text")
106+
107+
result = pipeline.fit(data).transform(data)
108+
```
109+
```scala
110+
val documentAssembler = new DocumentAssembler()
111+
.setInputCol("text")
112+
.setOutputCol("document")
113+
114+
val tokenizer = new Tokenizer()
115+
.setInputCols(Array("document"))
116+
.setOutputCol("token")
117+
118+
val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar")
119+
.setInputCols(Array("document", "token"))
120+
.setOutputCol("embeddings")
121+
122+
val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
123+
124+
val data = Seq("أنا أحب شرارة NLP").toDF("text")
125+
126+
val result = pipeline.fit(data).transform(data)
127+
```
128+
129+
{:.nlu-block}
130+
```python
131+
import nlu
132+
nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""")
133+
```
134+
</div>
135+
136+
{:.model-param}
137+
## Model Information
138+
139+
{:.table-model}
140+
|---|---|
141+
|Model Name:|bert_embeddings_ARBERT|
142+
|Compatibility:|Spark NLP 5.0.0+|
143+
|License:|Open Source|
144+
|Edition:|Official|
145+
|Input Labels:|[sentence, token]|
146+
|Output Labels:|[bert]|
147+
|Language:|ar|
148+
|Size:|605.3 MB|
149+
|Case sensitive:|true|

0 commit comments

Comments
 (0)