From 1b503cdacd65429dd56ece91de673e34454cda56 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 13:34:52 +0100 Subject: [PATCH 1/7] CU-8699np02n: Update CDB legacy conversion so that it works with CDBs with no name_isupper attribute --- medcat-v2/medcat/utils/legacy/convert_cdb.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/medcat-v2/medcat/utils/legacy/convert_cdb.py b/medcat-v2/medcat/utils/legacy/convert_cdb.py index 3122b0809..6399c5848 100644 --- a/medcat-v2/medcat/utils/legacy/convert_cdb.py +++ b/medcat-v2/medcat/utils/legacy/convert_cdb.py @@ -62,6 +62,7 @@ def load_old_raw_data(old_path: str) -> dict: ] NAME2KEYS = {'name2cuis', 'name2cuis2status', 'name2count_train', 'name_isupper'} +OPTIONAL_NAME2_KEYS = {"name_isupper", } CUI2KEYS = {'cui2names', 'cui2snames', 'cui2context_vectors', 'cui2count_train', 'cui2info', 'cui2tags', 'cui2type_ids', 'cui2preferred_name', 'cui2average_confidence', } @@ -167,6 +168,8 @@ def _add_cui_info(cdb: CDB, data: dict) -> CDB: def _add_name_info(cdb: CDB, data: dict) -> CDB: all_names = set() for key in NAME2KEYS: + if key in OPTIONAL_NAME2_KEYS and key not in data: + continue cnames = data[key].keys() logger.debug("Adding %d names based on '%s", len(cnames), key) all_names.update(cnames) @@ -181,7 +184,7 @@ def _add_name_info(cdb: CDB, data: dict) -> CDB: # so v2 only uses the latter since it provides extra information name2cuis2status = data['name2cuis2status'] name2cnt_train = data['name2count_train'] - name2is_upper = data['name_isupper'] + name2is_upper = data.get('name_isupper', {}) for name in all_names: cuis2status: dict[str, str] = {} _cuis2status = name2cuis2status.get(name, {}) From b5e8228582c3e298ed01a25e9e319b6a5ff523fa Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 15:00:35 +0100 Subject: [PATCH 2/7] CU-8699np02n: Add method to legacy converter to convert any config --- .../medcat/utils/legacy/convert_config.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/medcat-v2/medcat/utils/legacy/convert_config.py b/medcat-v2/medcat/utils/legacy/convert_config.py index d83ee98c2..2c4731cac 100644 --- a/medcat-v2/medcat/utils/legacy/convert_config.py +++ b/medcat-v2/medcat/utils/legacy/convert_config.py @@ -1,5 +1,5 @@ import json -from typing import Any, cast, Optional +from typing import Any, cast, Optional, Type import logging from pydantic import BaseModel @@ -7,6 +7,7 @@ from medcat.config import Config from medcat.utils.legacy.helpers import fix_old_style_cnf +from medcat.config.config import SerialisableBaseModel logger = logging.getLogger(__name__) @@ -185,3 +186,34 @@ def get_config_from_old(path: str) -> Config: with open(path) as f: old_cnf_data = json.load(f) return get_config_from_nested_dict(old_cnf_data) + + +def get_config_from_old_per_cls( + path: str, cls: Type[SerialisableBaseModel]) -> SerialisableBaseModel: + """Convert the saved v1 config into a v2 Config for a specific class. + + Args: + path (str): The v1 config path. + cls (Type[SerialisableBaseModel]): The class to convert to. + + Returns: + SerialisableBaseModel: The converted config. + """ + from medcat.config.config_meta_cat import ConfigMetaCAT + from medcat.config.config_transformers_ner import ConfigTransformersNER + from medcat.config.config_rel_cat import ConfigRelCAT + if cls is Config: + return get_config_from_old(path) + elif cls is ConfigMetaCAT: + from medcat.utils.legacy.convert_meta_cat import ( + load_cnf as load_meta_cat_cnf) + return load_meta_cat_cnf(path) + elif cls is ConfigTransformersNER: + from medcat.utils.legacy.convert_deid import ( + get_cnf as load_deid_cnf) + return load_deid_cnf(path) + elif cls is ConfigRelCAT: + from medcat.utils.legacy.convert_rel_cat import ( + load_cnf as load_rel_cat_cnf) + return load_rel_cat_cnf(path) + raise ValueError(f"The config at '{path}' is not a {cls.__name__}!") From 97fa3e687279d2222c46b1a87f5acca510777268 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 15:03:35 +0100 Subject: [PATCH 3/7] CU-8699np02n: Fix config legacy conversion --- medcat-v2/medcat/config/config.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/medcat-v2/medcat/config/config.py b/medcat-v2/medcat/config/config.py index 273bcb42d..6f21bad0c 100644 --- a/medcat-v2/medcat/config/config.py +++ b/medcat-v2/medcat/config/config.py @@ -1,3 +1,4 @@ +import os from typing import (Optional, Iterator, Iterable, TypeVar, cast, Type, Any, Literal) from typing import Protocol, runtime_checkable @@ -12,6 +13,9 @@ from medcat.utils.defaults import workers from medcat.utils.envsnapshot import Environment, get_environment_info from medcat.utils.iterutils import callback_iterator +from medcat.utils.defaults import ( + avoid_legacy_conversion, doing_legacy_conversion_message, + LegacyConversionDisabledError) from medcat.storage.serialisables import SerialisingStrategy from medcat.storage.serialisers import deserialise @@ -80,6 +84,13 @@ def merge_config(self, other: dict): @classmethod def load(cls, path: str) -> Self: + if os.path.isfile(path) and path.endswith(".dat"): + if avoid_legacy_conversion(): + raise LegacyConversionDisabledError(cls.__name__) + doing_legacy_conversion_message(logger, cls.__name__, path) + from medcat.utils.legacy.convert_config import ( + get_config_from_old_per_cls) + return cast(Self, get_config_from_old_per_cls(path, cls)) obj = deserialise(path) if not isinstance(obj, cls): raise ValueError(f"The path '{path}' is not a {cls.__name__}!") From 464088ef62977d321c6b14c936f76143e4bed23a Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 15:22:08 +0100 Subject: [PATCH 4/7] CU-8699np02n: Add a few simple tests for Config legacy conversion --- .../tests/resources/mct_v1_deid_cnf.json | 1 + .../tests/resources/mct_v1_meta_cat_cnf.json | 1 + .../tests/resources/mct_v1_rel_cat_cnf.json | 1 + .../tests/utils/legacy/test_convert_config.py | 28 +++++++++++++++++-- 4 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 medcat-v2/tests/resources/mct_v1_deid_cnf.json create mode 100644 medcat-v2/tests/resources/mct_v1_meta_cat_cnf.json create mode 100644 medcat-v2/tests/resources/mct_v1_rel_cat_cnf.json diff --git a/medcat-v2/tests/resources/mct_v1_deid_cnf.json b/medcat-v2/tests/resources/mct_v1_deid_cnf.json new file mode 100644 index 000000000..4a9bf1f87 --- /dev/null +++ b/medcat-v2/tests/resources/mct_v1_deid_cnf.json @@ -0,0 +1 @@ +{"general": {"name": "NOT-DEID", "model_name": "roberta-base", "seed": 13, "description": "No description", "pipe_batch_size_in_chars": 20000000, "ner_aggregation_strategy": "simple", "chunking_overlap_window": 5, "test_size": 0.2, "last_train_on": null, "verbose_metrics": false}} \ No newline at end of file diff --git a/medcat-v2/tests/resources/mct_v1_meta_cat_cnf.json b/medcat-v2/tests/resources/mct_v1_meta_cat_cnf.json new file mode 100644 index 000000000..e9199225b --- /dev/null +++ b/medcat-v2/tests/resources/mct_v1_meta_cat_cnf.json @@ -0,0 +1 @@ +{"general": {"device": "cpu", "disable_component_lock": false, "seed": 13, "description": "No description", "category_name": "TEST CATEGORY", "alternative_category_names": [], "category_value2id": {}, "alternative_class_names": [[]], "vocab_size": 3, "lowercase": true, "cntx_left": 15, "cntx_right": 10, "replace_center": null, "batch_size_eval": 5000, "annotate_overlapping": false, "tokenizer_name": "bbpe", "save_and_reuse_tokens": false, "pipe_batch_size_in_chars": 20000000, "span_group": null}, "model": {"model_name": "lstm", "model_variant": "bert-base-uncased", "model_freeze_layers": true, "num_layers": 2, "input_size": 300, "hidden_size": 300, "dropout": 0.5, "phase_number": 0, "category_undersample": "", "model_architecture_config": {"fc2": true, "fc3": false, "lr_scheduler": true}, "num_directions": 2, "nclasses": 2, "padding_idx": -1, "emb_grad": true, "ignore_cpos": false}, "train": {"batch_size": 100, "nepochs": 50, "lr": 0.001, "test_size": 0.1, "shuffle_data": true, "class_weights": null, "compute_class_weights": false, "score_average": "weighted", "prerequisites": {}, "cui_filter": null, "auto_save_model": true, "last_train_on": null, "metric": {"base": "weighted avg", "score": "f1-score"}, "loss_funct": "cross_entropy", "gamma": 2}} \ No newline at end of file diff --git a/medcat-v2/tests/resources/mct_v1_rel_cat_cnf.json b/medcat-v2/tests/resources/mct_v1_rel_cat_cnf.json new file mode 100644 index 000000000..52ecec5b7 --- /dev/null +++ b/medcat-v2/tests/resources/mct_v1_rel_cat_cnf.json @@ -0,0 +1 @@ +{"general": {"device": "cpu", "relation_type_filter_pairs": [], "vocab_size": null, "lowercase": true, "cntx_left": 15, "cntx_right": 15, "window_size": 300, "limit_samples_per_class": -1, "addl_rels_max_sample_size": 200, "create_addl_rels": false, "create_addl_rels_by_type": false, "tokenizer_name": "bert", "model_name": "bert-unknown", "log_level": 20, "max_seq_length": 512, "tokenizer_special_tokens": false, "annotation_schema_tag_ids": [30522, 30523, 30524, 30525], "tokenizer_relation_annotation_special_tokens_tags": ["[s1]", "[e1]", "[s2]", "[e2]"], "tokenizer_other_special_tokens": {"pad_token": "[PAD]"}, "labels2idx": {}, "idx2labels": {}, "pin_memory": true, "seed": 13, "task": "train", "language": "en"}, "model": {"input_size": 300, "hidden_size": 768, "hidden_layers": 3, "model_size": 5120, "dropout": 0.2, "num_directions": 2, "freeze_layers": true, "padding_idx": -1, "emb_grad": true, "ignore_cpos": false, "llama_use_pooled_output": false}, "train": {"nclasses": 2, "batch_size": 25, "nepochs": 1, "lr": 0.0001, "stratified_batching": false, "batching_samples_per_class": [], "batching_minority_limit": 0, "adam_betas": [0.9, 0.999], "adam_weight_decay": 0, "adam_epsilon": 1e-08, "test_size": 0.2, "gradient_acc_steps": 1, "multistep_milestones": [2, 4, 6, 8, 12, 15, 18, 20, 22, 24, 26, 30], "multistep_lr_gamma": 0.8, "max_grad_norm": 1.0, "shuffle_data": true, "class_weights": null, "enable_class_weights": false, "score_average": "weighted", "auto_save_model": true}} \ No newline at end of file diff --git a/medcat-v2/tests/utils/legacy/test_convert_config.py b/medcat-v2/tests/utils/legacy/test_convert_config.py index fa476cbaa..7206eae47 100644 --- a/medcat-v2/tests/utils/legacy/test_convert_config.py +++ b/medcat-v2/tests/utils/legacy/test_convert_config.py @@ -1,14 +1,19 @@ +from typing import Type import os from medcat.utils.legacy import convert_config from medcat.config import Config +from medcat.config.config import SerialisableBaseModel +from medcat.config.config_meta_cat import ConfigMetaCAT +from medcat.config.config_rel_cat import ConfigRelCAT +from medcat.config.config_transformers_ner import ConfigTransformersNER import unittest -TESTS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), - "..", "..")) +from ... import RESOURCES_PATH +TESTS_PATH = os.path.dirname(RESOURCES_PATH) class ValAndModelGetterTests(unittest.TestCase): @@ -78,3 +83,22 @@ def test_migrates_partial(self): def test_preprocesses_sets(self): self.assertEqual(self.cnf.preprocessing.words_to_skip, self.EXP_WORDS_TO_SKIP) + + +class PerClsConfigConversionTests(unittest.TestCase): + PATHS_AND_CLASSES: list[str, Type[SerialisableBaseModel]] = [ + (os.path.join(RESOURCES_PATH, "mct_v1_cnf.json"), Config), + (os.path.join(RESOURCES_PATH, + "mct_v1_meta_cat_cnf.json"), ConfigMetaCAT), + (os.path.join(RESOURCES_PATH, + "mct_v1_rel_cat_cnf.json"), ConfigRelCAT), + (os.path.join(RESOURCES_PATH, + "mct_v1_deid_cnf.json"), ConfigTransformersNER), + ] + + def test_can_convert(self): + for path, cls in self.PATHS_AND_CLASSES: + with self.subTest(f"Testing {cls.__name__} at {path}"): + cnf = convert_config.get_config_from_old_per_cls(path, cls) + self.assertIsInstance( + cnf, cls, f"Failed for {cls.__name__} at {path}") From 9532056421bf48edb4006034241c88b2564ac37f Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 15:30:23 +0100 Subject: [PATCH 5/7] CU-8699np02n: Add a little more sophistication to general config conversion tests --- .../tests/utils/legacy/test_convert_config.py | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/medcat-v2/tests/utils/legacy/test_convert_config.py b/medcat-v2/tests/utils/legacy/test_convert_config.py index 7206eae47..014eaa7b0 100644 --- a/medcat-v2/tests/utils/legacy/test_convert_config.py +++ b/medcat-v2/tests/utils/legacy/test_convert_config.py @@ -1,4 +1,4 @@ -from typing import Type +from typing import Type, Any import os from medcat.utils.legacy import convert_config @@ -86,19 +86,44 @@ def test_preprocesses_sets(self): class PerClsConfigConversionTests(unittest.TestCase): - PATHS_AND_CLASSES: list[str, Type[SerialisableBaseModel]] = [ - (os.path.join(RESOURCES_PATH, "mct_v1_cnf.json"), Config), + # paths, classes, expected path, expected value + # NOTE: These are hard-coded values I know I changed in the confgis + # before saving + PATHS_AND_CLASSES: list[str, Type[SerialisableBaseModel], str, Any] = [ (os.path.join(RESOURCES_PATH, - "mct_v1_meta_cat_cnf.json"), ConfigMetaCAT), + "mct_v1_cnf.json"), Config, + 'meta.description', "FAKE MODEL"), (os.path.join(RESOURCES_PATH, - "mct_v1_rel_cat_cnf.json"), ConfigRelCAT), + "mct_v1_meta_cat_cnf.json"), ConfigMetaCAT, + "general.category_name", 'TEST CATEGORY'), (os.path.join(RESOURCES_PATH, - "mct_v1_deid_cnf.json"), ConfigTransformersNER), + "mct_v1_rel_cat_cnf.json"), ConfigRelCAT, + "general.model_name", 'bert-unknown'), + (os.path.join(RESOURCES_PATH, + "mct_v1_deid_cnf.json"), ConfigTransformersNER, + "general.name", 'NOT-DEID'), ] + @classmethod + def setUpClass(cls): + return super().setUpClass() + + def _get_attr_nested(self, obj: SerialisableBaseModel, path: str) -> Any: + """Get an attribute from a nested object using a dot-separated path.""" + parts = path.split('.') + for part in parts: + obj = getattr(obj, part) + return obj + + def assert_can_convert( + self, path, cls: Type[SerialisableBaseModel], + exp_path: str, exp_value: Any): + cnf = convert_config.get_config_from_old_per_cls(path, cls) + self.assertIsInstance(cnf, cls, f"Failed for {cls.__name__}") + self.assertEqual(self._get_attr_nested(cnf, exp_path), exp_value, + f"Failed for {cls.__name__} at {exp_path}") + def test_can_convert(self): - for path, cls in self.PATHS_AND_CLASSES: + for path, cls, exp_path, exp_value in self.PATHS_AND_CLASSES: with self.subTest(f"Testing {cls.__name__} at {path}"): - cnf = convert_config.get_config_from_old_per_cls(path, cls) - self.assertIsInstance( - cnf, cls, f"Failed for {cls.__name__} at {path}") + self.assert_can_convert(path, cls, exp_path, exp_value) From a2542c3fff829319eb53cf16cf644420c088037b Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 16:34:02 +0100 Subject: [PATCH 6/7] CU-8699np02n: Simplify MetaCAT deserialisation from legacy data --- medcat-v2/medcat/components/addons/meta_cat/meta_cat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py index 985a42377..079497688 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py +++ b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py @@ -180,6 +180,9 @@ def deserialise_from(cls, folder_path: str, **init_kwargs cnf = init_kwargs['cnf'] else: config_path = os.path.join(folder_path, "meta_cat", "config") + if not os.path.exists(config_path): + # load legacy config (assuming it exists) + config_path += ".dat" logger.info( "Was not provide a config when loading a meta cat from '%s'. " "Inferring config from file at '%s'", folder_path, From 9b4021408e9d9c2aaaa04b60b9366db9ec2a5191 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 4 Jul 2025 19:04:38 +0100 Subject: [PATCH 7/7] CU-8699np02n: Fix MetaCAT legacy conversion --- .../components/addons/meta_cat/meta_cat.py | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py index 079497688..caf13908f 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py +++ b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py @@ -27,6 +27,9 @@ from medcat.cdb import CDB from medcat.vocab import Vocab from medcat.utils.defaults import COMPONENTS_FOLDER +from medcat.utils.defaults import ( + avoid_legacy_conversion, doing_legacy_conversion_message, + LegacyConversionDisabledError) from peft import get_peft_model, LoraConfig, TaskType # It should be safe to do this always, as all other multiprocessing @@ -173,9 +176,39 @@ def serialise_to(self, folder_path: str) -> None: os.mkdir(folder_path) self.save(folder_path) + @classmethod + def _create_throwaway_tokenizer(cls) -> BaseTokenizer: + from medcat.tokenizing.tokenizers import create_tokenizer + from medcat.config import Config + logger.warning( + "A base tokenizer was not provided during the loading of a " + "MetaCAT. The tokenizer is used to register the required data " + "paths for MetaCAT to function. Using the default of '%s'. If " + "this it not the tokenizer you will end up using, MetaCAT may " + "be unable to recover unless a) the paths are registered " + "explicitly, or b) there are other MetaCATs created with the " + "correct tokenizer. Do note that this will also create " + "another instance of the tokenizer, though it should be " + "garbage collected soon.", cls.DEFAULT_TOKENIZER + ) + # NOTE: the use of a (mostly) default config here probably won't + # affect anything since the tokenizer itself won't be used + gcnf = Config() + gcnf.general.nlp.provider = 'spacy' + return create_tokenizer(cls.DEFAULT_TOKENIZER, gcnf) + @classmethod def deserialise_from(cls, folder_path: str, **init_kwargs ) -> 'MetaCATAddon': + if "model.dat" in os.listdir(folder_path): + if not avoid_legacy_conversion(): + doing_legacy_conversion_message( + logger, cls.__name__, folder_path) + from medcat.utils.legacy.convert_meta_cat import ( + get_meta_cat_from_old) + return get_meta_cat_from_old( + folder_path, cls._create_throwaway_tokenizer()) + raise LegacyConversionDisabledError(cls.__name__,) if 'cnf' in init_kwargs: cnf = init_kwargs['cnf'] else: @@ -191,24 +224,7 @@ def deserialise_from(cls, folder_path: str, **init_kwargs if 'tokenizer' in init_kwargs: tokenizer = init_kwargs['tokenizer'] else: - from medcat.tokenizing.tokenizers import create_tokenizer - from medcat.config import Config - logger.warning( - "A base tokenizer was not provided during the loading of a " - "MetaCAT. The tokenizer is used to register the required data " - "paths for MetaCAT to function. Using the default of '%s'. If " - "this it not the tokenizer you will end up using, MetaCAT may " - "be unable to recover unless a) the paths are registered " - "explicitly, or b) there are other MetaCATs created with the " - "correct tokenizer. Do note that this will also create " - "another instance of the tokenizer, though it should be " - "garbage collected soon.", cls.DEFAULT_TOKENIZER - ) - # NOTE: the use of a (mostly) default config here probably won't - # affect anything since the tokenizer itself won't be used - gcnf = Config() - gcnf.general.nlp.provider = 'spacy' - tokenizer = create_tokenizer(cls.DEFAULT_TOKENIZER, gcnf) + tokenizer = cls._create_throwaway_tokenizer() return cls.load_existing( load_path=folder_path, cnf=cnf,