Skip to content

Commit fe5388a

Browse files
authored
CU-8699may9k Allow CDB and Vocab load to convert from legacy (#21)
* CU-8699may9k: Allow CDB to convert a legacy version upon load time
* CU-8699may9k: Allow Vocab to convert legacy version upon load time
* CU-8699may9k: Add small test for legacy conversion upon CDB load
* CU-8699may9k: Add small test for legacy conversion upon Vocab load
* CU-8699may9k: Allow disabling automatic legacy conversion by env variable when loading CDB
* CU-8699may9k: Allow disabling automatic legacy conversion by env variable when loading Vocab
* CU-8699may9k: Centralise decision on automatic legacy conversion
* CU-8699may9k: Move back to calculating legacy conversion on the fly. Yet still do it centrally
* CU-8699may9k: Centralise exception for disabled legacy conversion
* CU-8699may9k: Centralise log message regarding legacy conversion
* CU-8699may9k: Use centralised legacy conversion message for CDB and Vocab conversion
* CU-8699may9k: Update CAT tests to capture correct exception when legacy conversion disabled
* CU-8699may9k: Fix issue with loading legacy CDB as regular. More specifically, loading it as regular ON TOP of doing the conversion
* CU-8699may9k: Fix test issues. PS: This shouldn't have passed workflow before, no idea why it did and now doesn't.
1 parent 0a02455 commit fe5388a

File tree

8 files changed

+80
-18
lines changed

8 files changed

+80
-18
lines changed

medcat-v2-tutorials/notebooks/introductory/relcat/2._Infering_relations_from_annotations_with_Relation_toolkit.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "ec4a8509",
77
"metadata": {},
88
"outputs": [
@@ -92,7 +92,7 @@
9292
],
9393
"source": [
9494
"# Install medcat\n",
95-
"! pip install \"medcat[spacy,meta-cat] @ git+https://github.com/CogStack/cogstack-nlp@medcat/v0.11.2#subdirectory=medcat-v2\" # NOTE: VERSION-STRING"
95+
"! pip install \"medcat[spacy,rel-cat] @ git+https://github.com/CogStack/cogstack-nlp@medcat/v0.11.2#subdirectory=medcat-v2\" # NOTE: VERSION-STRING"
9696
]
9797
},
9898
{

medcat-v2/medcat/cat.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
from medcat.components.types import AbstractCoreComponent, HashableComponet
2828
from medcat.components.addons.addons import AddonComponent
2929
from medcat.utils.legacy.identifier import is_legacy_model_pack
30-
from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON
30+
from medcat.utils.defaults import avoid_legacy_conversion
31+
from medcat.utils.defaults import doing_legacy_conversion_message
32+
from medcat.utils.defaults import LegacyConversionDisabledError
3133
from medcat.utils.usage_monitoring import UsageMonitor
3234

3335

@@ -602,22 +604,13 @@ def load_model_pack(cls, model_pack_path: str) -> 'CAT':
602604
logger.info("Attempting to load model from file: %s",
603605
model_pack_path)
604606
is_legacy = is_legacy_model_pack(model_pack_path)
605-
should_avoid = os.environ.get(
606-
AVOID_LEGACY_CONVERSION_ENVIRON, "False").lower() == "true"
607-
if is_legacy and not should_avoid:
607+
avoid_legacy = avoid_legacy_conversion()
608+
if is_legacy and not avoid_legacy:
608609
from medcat.utils.legacy.conversion_all import Converter
609-
logger.warning(
610-
"Doing legacy conversion on model pack '%s'. "
611-
"This will make the model load take significantly longer. "
612-
"If you wish to avoid this, set the environment variable '%s' "
613-
"to 'true'", model_pack_path, AVOID_LEGACY_CONVERSION_ENVIRON)
610+
doing_legacy_conversion_message(logger, 'CAT', model_pack_path)
614611
return Converter(model_pack_path, None).convert()
615-
elif is_legacy and should_avoid:
616-
raise ValueError(
617-
f"The model pack '{model_pack_path}' is a legacy model pack. "
618-
"Please set the environment variable "
619-
f"'{AVOID_LEGACY_CONVERSION_ENVIRON}' "
620-
"to 'true' to allow automatic conversion.")
612+
elif is_legacy and avoid_legacy:
613+
raise LegacyConversionDisabledError("CAT")
621614
# NOTE: ignoring addons since they will be loaded later / separately
622615
cat = deserialise(model_pack_path, model_load_path=model_pack_path,
623616
ignore_folders_prefix={

medcat-v2/medcat/cdb/cdb.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import Iterable, Any, Collection, Union, Literal
2+
import os
23

34
from medcat.storage.serialisables import AbstractSerialisable
45
from medcat.cdb.concepts import CUIInfo, NameInfo, TypeInfo
@@ -9,6 +10,9 @@
910
from medcat.storage.zip_utils import (
1011
should_serialise_as_zip, serialise_as_zip, deserialise_from_zip)
1112
from medcat.utils.defaults import default_weighted_average, StatusTypes as ST
13+
from medcat.utils.defaults import avoid_legacy_conversion
14+
from medcat.utils.defaults import doing_legacy_conversion_message
15+
from medcat.utils.defaults import LegacyConversionDisabledError
1216
from medcat.utils.hasher import Hasher
1317
from medcat.preprocessors.cleaners import NameDescriptor
1418
from medcat.config import Config
@@ -510,6 +514,13 @@ def save(self, save_path: str,
510514
def load(cls, path: str) -> 'CDB':
511515
if should_serialise_as_zip(path, 'auto'):
512516
cdb = deserialise_from_zip(path)
517+
elif os.path.isfile(path) and path.endswith('.dat'):
518+
if not avoid_legacy_conversion():
519+
from medcat.utils.legacy.convert_cdb import get_cdb_from_old
520+
doing_legacy_conversion_message(logger, 'CDB', path)
521+
cdb = get_cdb_from_old(path)
522+
else:
523+
raise LegacyConversionDisabledError("CDB")
513524
else:
514525
cdb = deserialise(path)
515526
if not isinstance(cdb, CDB):

medcat-v2/medcat/utils/defaults.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import os
12
from typing import Optional
23
from multiprocessing import cpu_count
34
from functools import lru_cache
5+
import logging
46

57

68
DEFAULT_SPACY_MODEL = 'en_core_web_md'
@@ -9,6 +11,33 @@
911
AVOID_LEGACY_CONVERSION_ENVIRON = "MEDCAT_AVOID_LECACY_CONVERSION"
1012

1113

14+
def avoid_legacy_conversion() -> bool:
15+
return os.environ.get(
16+
AVOID_LEGACY_CONVERSION_ENVIRON, "False").lower() == "true"
17+
18+
19+
class LegacyConversionDisabledError(Exception):
20+
"""Raised when legacy conversion is disabled."""
21+
22+
def __init__(self, component_name: str):
23+
super().__init__(
24+
f"Legacy conversion is disabled (while loading {component_name}). "
25+
f"Set the environment variable {AVOID_LEGACY_CONVERSION_ENVIRON} "
26+
"to `False` to allow conversion.")
27+
28+
29+
def doing_legacy_conversion_message(
30+
logger: logging.Logger, component_name: str, file_path: str = '',
31+
level: int = logging.WARNING
32+
) -> None:
33+
logger.log(
34+
level,
35+
"Doing legacy conversion on %s (at '%s'). "
36+
"Set the environment variable %s "
37+
"to `True` to avoid this.",
38+
component_name, file_path, AVOID_LEGACY_CONVERSION_ENVIRON)
39+
40+
1241
@lru_cache(maxsize=100)
1342
def default_weighted_average(step: int, factor: float = 0.0004) -> float:
1443
return max(0.1, 1 - (step ** 2 * factor))

medcat-v2/medcat/vocab.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Optional, Any, cast, Union, Literal
22
from typing_extensions import TypedDict
3+
import os
4+
import logging
35

46
# import dill
57
import numpy as np
@@ -9,6 +11,12 @@
911
deserialise, AvailableSerialisers, serialise)
1012
from medcat.storage.zip_utils import (
1113
should_serialise_as_zip, serialise_as_zip, deserialise_from_zip)
14+
from medcat.utils.defaults import avoid_legacy_conversion
15+
from medcat.utils.defaults import doing_legacy_conversion_message
16+
from medcat.utils.defaults import LegacyConversionDisabledError
17+
18+
19+
logger = logging.getLogger(__name__)
1220

1321

1422
WordDescriptor = TypedDict('WordDescriptor',
@@ -323,6 +331,14 @@ def save(self, save_path: str,
323331
def load(cls, path: str) -> 'Vocab':
324332
if should_serialise_as_zip(path, 'auto'):
325333
vocab = deserialise_from_zip(path)
334+
elif os.path.isfile(path) and path.endswith('.dat'):
335+
if not avoid_legacy_conversion():
336+
from medcat.utils.legacy.convert_vocab import (
337+
get_vocab_from_old)
338+
doing_legacy_conversion_message(logger, 'Vocab', path)
339+
vocab = get_vocab_from_old(path)
340+
else:
341+
raise LegacyConversionDisabledError("Vocab")
326342
else:
327343
vocab = deserialise(path)
328344
if not isinstance(vocab, Vocab):

medcat-v2/tests/cdb/test_cdb.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@
1010
import tempfile
1111

1212
from .. import UNPACKED_EXAMPLE_MODEL_PACK_PATH, RESOURCES_PATH
13+
from .. import UNPACKED_V1_MODEL_PACK_PATH
1314

1415

1516
ZIPPED_CDB_PATH = os.path.join(RESOURCES_PATH, "mct2_cdb.zip")
1617

1718

1819
class CDBTests(TestCase):
1920
CDB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "cdb")
21+
LEGACY_CDB_PATH = os.path.join(UNPACKED_V1_MODEL_PACK_PATH, "cdb.dat")
2022
CUI_TO_REMOVE = "C03"
2123
NAMES_TO_REMOVE = ['high~temperature']
2224
TO_FILTER = ['C01', 'C02']
@@ -40,6 +42,10 @@ def test_can_load_from_zip(self):
4042
# make sure it's actually a file not a folder
4143
self.assertTrue(os.path.isfile(ZIPPED_CDB_PATH))
4244

45+
def test_can_convert_legacy_upon_load(self):
46+
loaded = cdb.CDB.load(self.LEGACY_CDB_PATH)
47+
self.assertIsInstance(loaded, cdb.CDB)
48+
4349
def test_can_save_to_zip(self):
4450
with tempfile.TemporaryDirectory() as temp_dir:
4551
file_name = os.path.join(temp_dir, "cdb.zip")

medcat-v2/tests/test_cat.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from medcat.utils.cdb_state import captured_state_cdb
1818
from medcat.components.addons.meta_cat import MetaCATAddon
1919
from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON
20+
from medcat.utils.defaults import LegacyConversionDisabledError
2021

2122
import unittest
2223
import tempfile
@@ -648,7 +649,7 @@ def test_can_load_legacy_model_unpacked(self):
648649
def test_cannot_load_legacy_with_environ_set(self):
649650
with unittest.mock.patch.dict(os.environ, {
650651
AVOID_LEGACY_CONVERSION_ENVIRON: "true"}, clear=True):
651-
with self.assertRaises(ValueError):
652+
with self.assertRaises(LegacyConversionDisabledError):
652653
cat.CAT.load_model_pack(V1_MODEL_PACK_PATH)
653654

654655

medcat-v2/tests/test_vocab.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import tempfile
1010

1111
from . import UNPACKED_EXAMPLE_MODEL_PACK_PATH, RESOURCES_PATH
12+
from . import UNPACKED_V1_MODEL_PACK_PATH
1213

1314

1415
ZIPPED_VOCAB_PATH = os.path.join(RESOURCES_PATH, "mct2_vocab.zip")
@@ -169,6 +170,7 @@ def test_neg_sampling_does_not_include_vectorless(
169170

170171
class DefaultVocabTests(unittest.TestCase):
171172
VOCAB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, 'vocab')
173+
LEGACY_VOCAB_PATH = os.path.join(UNPACKED_V1_MODEL_PACK_PATH, "vocab.dat")
172174
EXP_SHAPE = (7,)
173175

174176
@classmethod
@@ -199,6 +201,10 @@ def test_can_load_from_zip(self):
199201
vocab = Vocab.load(ZIPPED_VOCAB_PATH)
200202
self.assertIsInstance(vocab, Vocab)
201203

204+
def test_can_convert_legacy_upon_load(self):
205+
loaded = Vocab.load(self.LEGACY_VOCAB_PATH)
206+
self.assertIsInstance(loaded, Vocab)
207+
202208
def test_can_save_to_zip(self):
203209
with tempfile.TemporaryDirectory() as temp_dir:
204210
file_name = os.path.join(temp_dir, 'vocab.zip')

0 commit comments

Comments
 (0)