Revert "Pushing change for MetaCAT training"

shubham-s-agarwal · shubham-s-agarwal · commit f214549907db · 2025-05-22T14:45:00.000+01:00
This reverts commit e6c0041.
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
@@ -2,17 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "d58c720d",
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "import os\n",
     "from datetime import date\n",
+    "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
     "from medcat.config_meta_cat import ConfigMetaCAT\n",
-    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
+    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
+    "from tokenizers import ByteLevelBPETokenizer"
    ]
   },
   {
@@ -76,39 +78,35 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2933f7e1",
+   "cell_type": "markdown",
+   "id": "35aa5605",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "for meta_model in meta_model_names:\n",
-    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
-    "    with open(config_file, 'r') as jfile:\n",
-    "        config_dict = json.load(jfile)\n",
-    "    print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
+    "Before you run the next section, please double-check that the model's meta_annotation names match those specified in the MedCATtrainer (MCT) export.\n",
+    "\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3047b1d9",
+   "id": "8bf6f5c3",
    "metadata": {},
    "source": [
-    "<b> Note: </b> \n",
-    " The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
-    " <br><br>To accomodate for this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
-    "<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
+    "Depending on the model pack you have, please run the LSTM model or BERT model section. <br>\n",
+    "If you are unsure, run the cell below to check which model type each meta model uses."
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "12e91f77",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2933f7e1",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "<b> Note: </b> \n",
-    " The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
-    " <br><br>To accomodate for this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
-    "<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
+    "for meta_model in meta_model_names:\n",
+    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
+    "    with open(config_file, 'r') as jfile:\n",
+    "        config_dict = json.load(jfile)\n",
+    "    print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
    ]
   },
   {
@@ -133,11 +131,9 @@
     "\n",
     "    # changing parameters\n",
     "    mc.config.train['nepochs'] = 15\n",
-    "    \n",
-    "    # current model will be overwritten\n",
-    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
-    "    # to save the new model elsewhere, uncomment the below line\n",
-    "    #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
+    "\n",
+    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
+    "    #Ideally this should replace the meta_models inside the modelpack\n",
     "\n",
     "    # train the meta_model\n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
@@ -151,8 +147,7 @@
    "id": "ab23e424",
    "metadata": {},
    "source": [
-    "## If you dont have the model packs, and are training from scratch\n",
-    "<b>This is very rare, it is recommended to always use the model packs and then fine-tune them</b>"
+    "## If you don't have the model packs and are training from scratch"
    ]
   },
   {
@@ -172,7 +167,8 @@
     "\n",
     "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
     "\n",
-    "save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
+    "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
+    "#Ideally this should replace the meta_models inside the modelpack\n",
     "\n",
     "# Initialise and train meta_model\n",
     "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb
@@ -10,16 +10,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "d58c720d",
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "import os\n",
     "from datetime import date\n",
+    "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
-    "from medcat.config_meta_cat import ConfigMetaCAT"
+    "from medcat.config_meta_cat import ConfigMetaCAT\n",
+    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
+    "from tokenizers import ByteLevelBPETokenizer"
    ]
   },
   {
@@ -85,24 +88,11 @@
   },
   {
    "cell_type": "markdown",
-   "id": "d4a3632b",
-   "metadata": {},
-   "source": [
-    "<b> Note: </b> \n",
-    " The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
-    " <br><br>To accomodate for this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
-    "<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d8bdc404",
+   "id": "35aa5605",
    "metadata": {},
    "source": [
-    "<b> Note: </b> \n",
-    " The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
-    " <br><br>To accomodate for this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
-    "<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
+    "Before you run the next section, please double-check that the model's meta_annotation names match those specified in the MedCATtrainer (MCT) export.\n",
+    "\n"
    ]
   },
   {
@@ -193,14 +183,9 @@
     "    if class_wt_phase1:\n",
     "        mc.config.train['class_weights'] = class_wt_phase1\n",
     "\n",
-    "    #You can change the number of epochs, remember to keep them higher for phase 1\n",
-    "    mc.config.train['nepochs'] = 40 \n",
-    "\n",
-    "    # current model will be overwritten\n",
-    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
-    "    # to save the new model elsewhere, uncomment the below line\n",
-    "    #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
+    "    mc.config.train['nepochs'] = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n",
     "\n",
+    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
     "    # Save results\n",
     "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n",
@@ -217,21 +202,18 @@
     "    if class_wt_phase2:\n",
     "        mc.config.train['class_weights'] = class_wt_phase2\n",
     "\n",
-    "    #You can change the number of epochs\n",
-    "    mc.config.train['nepochs'] = 20\n",
+    "    mc.config.train['nepochs'] = 15\n",
     "\n",
-    "    # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
-    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
-    "    \n",
+    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
     "    # Save results\n",
     "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n",
     "\n",
     "#--------------------------------Driver--------------------------------\n",
     "for meta_model in meta_model_names:\n",
-    "    #To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n",
-    "    class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n",
-    "    class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n",
+    "    #To use your own class weights instead of the pre-defined ones for the 2 phases, uncomment the below lines\n",
+    "    '''class_wt_phase1 = []\n",
+    "    class_wt_phase2 = []'''\n",
     "\n",
     "    # Train 2 phase learning\n",
     "    logger.info(\"\\n********************Beginning Phase 1********************\")\n",
@@ -275,7 +257,7 @@
     "# Follow all the same steps till initializing the metacat model\n",
     "\n",
     "# Initialise and train meta_model\n",
-    "mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n",
+    "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
     "\n",
     "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n",
     "#                ['text','of','the','document'], [index of medical entity], \"label\" ]]\n",