Skip to content

Commit b75c406

Browse files
authored
Merge pull request #211 from ZachEichen/master
Create cleaning util module mirroring util.py
2 parents 89d93e2 + 660715c commit b75c406

File tree

10 files changed

+11871
-6013
lines changed

10 files changed

+11871
-6013
lines changed

text_extensions_for_pandas/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,13 @@
4040
# Sub-modules
4141
from text_extensions_for_pandas import io
4242
from text_extensions_for_pandas import spanner
43+
from text_extensions_for_pandas import cleaning
4344

4445
# Sphinx autodoc needs this redundant listing of public symbols to list the contents
4546
# of this subpackage.
4647
__all__ = [
4748
"Span", "SpanDtype", "SpanArray",
4849
"TokenSpan", "TokenSpanDtype", "TokenSpanArray",
4950
"TensorElement", "TensorDtype", "TensorArray",
50-
"io"
51+
"io", 'cleaning'
5152
]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#
2+
# Copyright (c) 2021 IBM Corp.
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
################################################################################
17+
# cleaning module
18+
#
19+
# Functions in text_extensions_for_pandas that allow for identification of
20+
# possibly incorrect labels, and quick training of models on BERT embeddings
21+
# of a corpus.
22+
23+
# Expose the public APIs that users should get from importing the top-level
24+
# library.
25+
26+
from text_extensions_for_pandas.cleaning import ensemble
27+
from text_extensions_for_pandas.cleaning import analysis
28+
from text_extensions_for_pandas.cleaning import preprocess
29+
30+
# Import the important functions from each sub-module.
31+
from text_extensions_for_pandas.cleaning.preprocess import (
32+
preprocess_documents,
33+
combine_raw_spans_docs,
34+
)
35+
from text_extensions_for_pandas.cleaning.analysis import (
36+
flag_suspicious_labels,
37+
create_f1_score_report,
38+
create_f1_score_report_iob,
39+
)
40+
from text_extensions_for_pandas.cleaning.ensemble import (
41+
train_reduced_model,
42+
train_model_ensemble,
43+
infer_and_extract_entities_iob,
44+
infer_and_extract_raw_entites,
45+
infer_on_df,
46+
)
47+
48+
# Only the three sub-modules are listed here, so a star-import of this package
# binds `ensemble`, `analysis` and `preprocess` but not the individual functions
# imported above; those remain available as attributes of the package.
__all__ = ["ensemble", "analysis", "preprocess"]

0 commit comments

Comments
 (0)