From b58013df0542127151a9298e8bddd989c0b13932 Mon Sep 17 00:00:00 2001 From: spencerthomas1722 Date: Tue, 29 Apr 2025 09:46:45 -0400 Subject: [PATCH 1/5] update dapt for to work with current tasks --- src/cnlpt/CnlpModelForClassification.py | 35 ++++++++-- src/cnlpt/cnlp_args.py | 36 +++++----- src/cnlpt/cnlp_data.py | 18 +++-- src/cnlpt/cnlp_processors.py | 2 +- src/cnlpt/dapt.py | 81 ++++++++++++++++++---- src/cnlpt/train_system.py | 92 +++++++++++++++++-------- 6 files changed, 191 insertions(+), 73 deletions(-) diff --git a/src/cnlpt/CnlpModelForClassification.py b/src/cnlpt/CnlpModelForClassification.py index 30fdde88..30d0caf1 100644 --- a/src/cnlpt/CnlpModelForClassification.py +++ b/src/cnlpt/CnlpModelForClassification.py @@ -287,7 +287,7 @@ def __init__( self.encoder = encoder_model.from_pretrained(config.encoder_name) # part of the motivation for leaving this # logic alone for character level models is that - # at the time of writing, CANINE and Flair are the only game in town. + # at the time of writing, CANINE and Flair are the only game in town. # CANINE's hashable embeddings for unicode codepoints allows for # additional parameterization, which rn doesn't seem so relevant if not config.character_level: @@ -329,12 +329,12 @@ def __init__( head_size=config.rel_attention_head_dims, ) if config.relations[task_name]: - hidden_size = config.num_rel_attention_heads - if config.use_prior_tasks: - hidden_size += total_prev_task_labels + # hidden_size = config.num_rel_attention_heads + # if config.use_prior_tasks: + # hidden_size += total_prev_task_labels self.classifiers[task_name] = ClassificationHead( - config, task_num_labels, hidden_size=hidden_size + config, task_num_labels, ) else: self.classifiers[task_name] = ClassificationHead( @@ -491,6 +491,30 @@ def compute_loss( ) state["loss"] += task_weight * task_loss + def remove_task_classifiers(self, tasks: list[str] = None): + if tasks is None: + self.classifiers = nn.ModuleDict() + self.tasks = [] + self.class_weights = {} + else: + for task in tasks: + self.classifiers.pop(task) + self.tasks.remove(task) + self.class_weights.pop(task) + + def add_task_classifier(self, task_name: str, label_dictionary: dict[str, list]): + self.tasks.append(task_name) + self.classifiers[task_name] = ClassificationHead( + self.config, len(label_dictionary) + ) + self.label_dictionary[task_name] = label_dictionary + + def set_class_weights(self, class_weights: Union[list[float], None] = None): + if class_weights is None: + self.class_weights = {x: None for x in self.label_dictionary.keys()} + else: + self.class_weights = class_weights + def forward( self, input_ids=None, @@ -531,7 +555,6 @@ def forward( Returns: (`transformers.SequenceClassifierOutput`) the output of the model """ - kwargs = generalize_encoder_forward_kwargs( self.encoder, attention_mask=attention_mask, diff --git a/src/cnlpt/cnlp_args.py b/src/cnlpt/cnlp_args.py index d3d45c4a..d07d0c59 100644 --- a/src/cnlpt/cnlp_args.py +++ b/src/cnlpt/cnlp_args.py @@ -298,18 +298,18 @@ class DaptArguments: "help": "Pretrained tokenizer name or path if not the same as model_name" }, ) - output_dir: Union[str, None] = field( - default=None, metadata={"help": "Directory path to write trained model to."} - ) - overwrite_output_dir: bool = field( - default=False, - metadata={ - "help": ( - "Overwrite the content of the output directory. " - "Use this to continue training if output_dir points to a checkpoint directory." 
- ) - }, - ) + # output_dir: Union[str, None] = field( + # default=None, metadata={"help": "Directory path to write trained model to."} + # ) + # overwrite_output_dir: bool = field( + # default=False, + # metadata={ + # "help": ( + # "Overwrite the content of the output directory. " + # "Use this to continue training if output_dir points to a checkpoint directory." + # ) + # }, + # ) data_dir: Union[str, None] = field( default=None, metadata={"help": "The data dir for domain-adaptive pretraining."} ) @@ -333,12 +333,12 @@ class DaptArguments: default=0.2, metadata={"help": "The test split proportion for domain-adaptive pretraining."}, ) - seed: int = field( - default=42, - metadata={ - "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)." - }, - ) + # seed: int = field( + # default=42, + # metadata={ + # "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)." + # }, + # ) no_eval: bool = field( default=False, metadata={"help": "Don't split into train and test; just pretrain."}, diff --git a/src/cnlpt/cnlp_data.py b/src/cnlpt/cnlp_data.py index ba9090e8..e2033032 100644 --- a/src/cnlpt/cnlp_data.py +++ b/src/cnlpt/cnlp_data.py @@ -7,6 +7,8 @@ from enum import Enum from typing import Union +import pdb + import datasets import numpy as np from datasets import Dataset as HFDataset @@ -15,8 +17,8 @@ from transformers import BatchEncoding, DataCollatorForLanguageModeling from transformers.tokenization_utils import PreTrainedTokenizer -from .cnlp_args import DaptArguments -from .cnlp_processors import AutoProcessor, classification, relex, tagging +from cnlp_args import DaptArguments +from cnlp_processors import AutoProcessor, classification, relex, tagging special_tokens = ["", "", "", "", "", "", "", ""] text_columns = ["text", "text_a", "text_b"] @@ -1155,10 +1157,14 @@ def __init__( batched=True, remove_columns=list(remove_columns), ) - dataset = dataset.map( - functools.partial(group_texts, self.args.chunk_size), - batched=True, - ) + + dataset = dataset.remove_columns("word_ids") + # dataset = dataset.map( + # functools.partial(group_texts, self.args.chunk_size), + # batched=True, + # ) + + if isinstance(dataset, (DatasetDict, IterableDatasetDict)) or args.no_eval: self.dataset = dataset diff --git a/src/cnlpt/cnlp_processors.py b/src/cnlpt/cnlp_processors.py index db496505..997a120d 100644 --- a/src/cnlpt/cnlp_processors.py +++ b/src/cnlpt/cnlp_processors.py @@ -171,7 +171,7 @@ def __init__(self, data_dir: str, tasks: set[str] = None, max_train_items=-1): else: sep = "\t" - self.dataset = load_dataset("csv", sep=sep, data_files=data_files) + self.dataset = load_dataset("csv", sep=sep, data_files=data_files, keep_default_na=False) ## find out what tasks are available to this dataset, and see the overlap with what the ## user specified at the cli, remove those tasks so we don't also get them from other datasets diff --git a/src/cnlpt/dapt.py b/src/cnlpt/dapt.py index 4463b9b8..4e85679b 100644 --- a/src/cnlpt/dapt.py +++ b/src/cnlpt/dapt.py @@ -8,6 +8,7 @@ from typing import Any, Union from transformers import ( + AutoConfig, AutoModelForMaskedLM, AutoTokenizer, HfArgumentParser, @@ -16,12 +17,64 @@ set_seed, ) +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import MaskedLMOutput +from transformers.modeling_utils import PreTrainedModel + +from .CnlpModelForClassification import CnlpConfig, generalize_encoder_forward_kwargs from .cnlp_args import 
DaptArguments from .cnlp_data import DaptDataset logger = logging.getLogger(__name__) +class DaptModel(PreTrainedModel): + base_model_prefix = "cnlpt" + config_class = CnlpConfig + + def __init__( + self, + config: config_class, + ): + super().__init__(config) + encoder_config = AutoConfig.from_pretrained(config._name_or_path) + encoder_config.vocab_size = config.vocab_size + config.encoder_config = encoder_config.to_dict() + model = AutoModelForMaskedLM.from_config(encoder_config) + self.encoder = model.from_pretrained(config._name_or_path) + # if not config.character_level: + self.encoder.resize_token_embeddings(encoder_config.vocab_size) + + def forward( + self, + input_ids, + token_type_ids, + attention_mask, + labels, + ): + kwargs = generalize_encoder_forward_kwargs( + self.encoder, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True, + return_dict=True, + ) + + outputs = self.encoder(input_ids, **kwargs) + logits = outputs.logits + + if labels is not None: + loss_fn = CrossEntropyLoss() + loss = loss_fn(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + return MaskedLMOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def main( json_file: Union[str, None] = None, json_obj: Union[dict[str, Any], None] = None ): @@ -39,30 +92,31 @@ def main( :rtype: typing.Dict[str, typing.Dict[str, typing.Any]] :return: the evaluation results (will be empty if ``--do_eval`` not passed) """ - parser = HfArgumentParser((DaptArguments,)) + parser = HfArgumentParser((DaptArguments, TrainingArguments)) dapt_args: DaptArguments + training_args: TrainingArguments if json_file is not None and json_obj is not None: raise ValueError("cannot specify json_file and json_obj") if json_file is not None: - (dapt_args,) = parser.parse_json_file(json_file=json_file) + (dapt_args, training_args) = parser.parse_json_file(json_file=json_file) elif json_obj is not None: - (dapt_args,) = parser.parse_dict(json_obj) + (dapt_args, training_args) = parser.parse_dict(json_obj) elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - (dapt_args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + (dapt_args, training_args) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - (dapt_args,) = parser.parse_args_into_dataclasses() + (dapt_args, training_args) = parser.parse_args_into_dataclasses() if ( - os.path.exists(dapt_args.output_dir) - and os.listdir(dapt_args.output_dir) - and not dapt_args.overwrite_output_dir + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and not training_args.overwrite_output_dir ): raise ValueError( - f"Output directory ({dapt_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
) # Setup logging @@ -85,9 +139,10 @@ def main( # logger.info("Model parameters %s" % model_args) logger.info(f"Domain adaptation parameters {dapt_args}") + logger.info(f"Training arguments {training_args}") # Set seed - set_seed(dapt_args.seed) + set_seed(training_args.seed) # Load tokenizer: Need this first for loading the datasets tokenizer = AutoTokenizer.from_pretrained( @@ -101,13 +156,15 @@ def main( # additional_special_tokens=['', '', '', '', '', '', '', ''] ) - model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name) + # model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name) + config = AutoConfig.from_pretrained(dapt_args.encoder_name) + model = DaptModel(config) dataset = DaptDataset(dapt_args, tokenizer=tokenizer) trainer = Trainer( model=model, - args=TrainingArguments(output_dir=dapt_args.output_dir), + args=training_args, train_dataset=dataset.train, eval_dataset=dataset.test if not dapt_args.no_eval else None, data_collator=dataset.data_collator, diff --git a/src/cnlpt/train_system.py b/src/cnlpt/train_system.py index bdb335be..7caa2f1d 100644 --- a/src/cnlpt/train_system.py +++ b/src/cnlpt/train_system.py @@ -300,7 +300,7 @@ def main( if data_args.weight_classes: from collections import Counter - class_weights = [] + class_weights = {} for task in task_names: # get labels in the right order ([0, 1]) if isinstance( @@ -309,17 +309,21 @@ def main( dataset.tasks_to_labels[task] = dataset.tasks_to_labels[task][1:] + [ dataset.tasks_to_labels[task][0] ] - labels = dataset.processed_dataset["train"][task] + if tagger[task]: + labels = [token_label for sent in dataset.processed_dataset["train"][task] for token_label in sent.split()] + else: + labels = dataset.processed_dataset["train"][task] weights = [] label_counts = Counter(labels) for label in dataset.tasks_to_labels[task]: - weights.append(len(labels) / (num_labels[task] * label_counts[label])) + count = max(label_counts[label], 1) + weights.append(len(labels) / (num_labels[task] * count)) # class weights are determined by severity of class imbalance if len(task_names) > 1: - class_weights.append(weights) + class_weights[task] = torch.tensor(weights).to(training_args.device) else: - class_weights = weights # if we just have the one class, simplify the tensor or pytorch will be mad - class_weights = torch.tensor(class_weights).to(training_args.device) + class_weights = torch.tensor(weights).to(training_args.device) # if we just have the one class, simplify the tensor or pytorch will be mad + # class_weights = torch.tensor(class_weights).to(training_args.device) # sm = torch.nn.Softmax(dim=class_weights.ndim - 1) # class_weights = sm(class_weights) @@ -446,6 +450,7 @@ def main( # TODO check when download any pretrained language model to local disk, if # the following condition "is_hub_model(encoder_name)" works or not. + # ^ is_hub_model and is_external_encoder both return False, as long as "model_type": "cnlpt" is in config.json if not is_external_encoder(encoder_name): # we are loading one of our own trained models as a starting point. # @@ -459,7 +464,6 @@ def main( # the model file to be loaded down below the normal way. since that temp file # doesn't have a stored classifier it will use the randomly-inited classifier head # with the size of the supplied config (for the new task). - # TODO This setting 1) is not tested yet. # 2) if training_args.do_train is false: # we evaluate or make predictions of our trained models. 
# Both two setting require the registeration of CnlpConfig, and use @@ -468,6 +472,11 @@ def main( # Load the cnlp configuration using AutoConfig, this will not override # the arguments from trained cnlp models. While using CnlpConfig will override # the model_type and model_name of the encoder. + if model_args.keep_existing_classifiers == model_args.ignore_existing_classifiers: # XNOR + raise ValueError( + "For continued training of a cnlpt model, one of --keep_existing_classifiers or --ignore_existing_classifiers flags should be selected." + ) + config = AutoConfig.from_pretrained( ( model_args.config_name @@ -477,41 +486,56 @@ def main( cache_dir=model_args.cache_dir, # in this case we're looking at a fine-tuned model (?) character_level=data_args.character_level, + layer=model_args.layer, ) - if training_args.do_train: # Setting 1) only load weights from the encoder - raise NotImplementedError( - "This functionality has not been restored yet" - ) + if model_args.ignore_existing_classifiers: + config.finetuning_task = ( + data_args.task_name + if data_args.task_name is not None + else dataset.tasks + ) + elif model_args.keep_existing_classifiers: + # setting 2) evaluate or make predictions + if ( + config.finetuning_task != data_args.task_name + or config.relations != relations + or config.tagger != tagger + ): + raise ValueError( + "When --keep_existing_classifiers is selected, please ensure" + "that you set the settings the same as those used in the" + "previous training run." + ) + model = CnlpModelForClassification( - model_path=model_args.encoder_name, config=config, - cache_dir=model_args.cache_dir, - tagger=tagger, - relations=relations, - class_weights=dataset.class_weights, + # class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, - use_prior_tasks=model_args.use_prior_tasks, - argument_regularization=model_args.arg_reg, ) - delattr(model, "classifiers") - delattr(model, "feature_extractors") + if model_args.ignore_existing_classifiers: + model.remove_task_classifiers() + for task in data_args.task_name: + model.add_task_classifier(task, dataset.get_labels()[task]) + model.set_class_weights(dataset.class_weights) + if training_args.do_train: tempmodel = tempfile.NamedTemporaryFile(dir=model_args.cache_dir) torch.save(model.state_dict(), tempmodel) model_name = tempmodel.name - else: + else: # load existing head # setting 2) evaluate or make predictions model = CnlpModelForClassification.from_pretrained( model_args.encoder_name, config=config, - class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, freeze=training_args.freeze, bias_fit=training_args.bias_fit, ) - + model.tasks = data_args.task_name else: # This only works when model_args.encoder_name is one of the # model card from https://huggingface.co/models @@ -541,7 +565,7 @@ def main( config.vocab_size = len(tokenizer) model = CnlpModelForClassification( config=config, - class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, freeze=training_args.freeze, bias_fit=training_args.bias_fit, @@ -656,15 +680,22 @@ def compute_metrics_fn(p: EvalPrediction): raise RuntimeError( f"Unrecognized label type: {type(training_args.model_selection_label)}" ) - else: # same default as in 0.6.0 + elif dataset.output_modes[task] == relex: task_scores.append( metrics[task_name].get( "one_score", np.mean(metrics[task_name].get("f1")) ) ) + else: + 
task_scores.append( + metrics[task_name].get( + "one_score", np.mean(metrics[task_name].get("token_f1")) + ) + ) # task_scores.append(processor.get_one_score(metrics.get(task_name, metrics.get(task_name.split('-')[0], None)))) one_score = sum(task_scores) / len(task_scores) + metrics["one_score"] = one_score if model is not None: if not hasattr(model, "best_score") or one_score > model.best_score: @@ -675,7 +706,7 @@ def compute_metrics_fn(p: EvalPrediction): model.best_eval_results = metrics if trainer.is_world_process_zero(): if training_args.do_train: - trainer.save_model() + trainer.save_model() # NOTE: a RobertaConfig is loaded here. why? tokenizer.save_pretrained(training_args.output_dir) if model_name == "cnn" or model_name == "lstm": with open( @@ -690,7 +721,7 @@ def compute_metrics_fn(p: EvalPrediction): ) config_dict["task_names"] = task_names json.dump(config_dict, f) - for task_ind, task_name in enumerate(metrics): + for task_ind, task_name in enumerate(task_names): with open(output_eval_file, "a") as writer: logger.info( f"***** Eval results for task {task_name} *****" @@ -720,7 +751,8 @@ def compute_metrics_fn(p: EvalPrediction): return compute_metrics_fn # Initialize our Trainer - training_args.load_best_model_at_end = True + # training_args.load_best_model_at_end = True + # TODO the argument in CnlpTrainingArguments is `model_selection_score`. reconcile this with `metric_for_best_model`? training_args.metric_for_best_model = "one_score" trainer = Trainer( model=model, @@ -884,7 +916,7 @@ def compute_metrics_fn(p: EvalPrediction): out_table = process_prediction( task_names=dataset.tasks, - error_analysis=False, + error_analysis=training_args.error_analysis, output_prob=training_args.output_prob, character_level=data_args.character_level, task_to_label_packet=task_to_label_packet, @@ -910,4 +942,4 @@ def _mp_fn(index): if __name__ == "__main__": - main() + main() \ No newline at end of file From 02a22ee943dd89e500bedea771f8e57f276248ba Mon Sep 17 00:00:00 2001 From: Spencer Thomas Date: Tue, 29 Apr 2025 15:35:27 -0400 Subject: [PATCH 2/5] make the training arguments in dapt.py CnlpTrainingArguments to enable our special arguments --- src/cnlpt/dapt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cnlpt/dapt.py b/src/cnlpt/dapt.py index 4e85679b..49f75ea6 100644 --- a/src/cnlpt/dapt.py +++ b/src/cnlpt/dapt.py @@ -22,7 +22,7 @@ from transformers.modeling_utils import PreTrainedModel from .CnlpModelForClassification import CnlpConfig, generalize_encoder_forward_kwargs -from .cnlp_args import DaptArguments +from .cnlp_args import DaptArguments, CnlpTrainingArguments from .cnlp_data import DaptDataset logger = logging.getLogger(__name__) @@ -92,9 +92,9 @@ def main( :rtype: typing.Dict[str, typing.Dict[str, typing.Any]] :return: the evaluation results (will be empty if ``--do_eval`` not passed) """ - parser = HfArgumentParser((DaptArguments, TrainingArguments)) + parser = HfArgumentParser((DaptArguments, CnlpTrainingArguments)) dapt_args: DaptArguments - training_args: TrainingArguments + training_args: CnlpTrainingArguments if json_file is not None and json_obj is not None: raise ValueError("cannot specify json_file and json_obj") From 3108b1d6dc55fb9d59a9e7a5734d4071c9a49f75 Mon Sep 17 00:00:00 2001 From: spencerthomas1722 Date: Wed, 30 Apr 2025 12:26:44 -0400 Subject: [PATCH 3/5] fix small syntax issue --- src/cnlpt/cnlp_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cnlpt/cnlp_data.py b/src/cnlpt/cnlp_data.py 
index e2033032..0124a6c2 100644 --- a/src/cnlpt/cnlp_data.py +++ b/src/cnlpt/cnlp_data.py @@ -17,8 +17,8 @@ from transformers import BatchEncoding, DataCollatorForLanguageModeling from transformers.tokenization_utils import PreTrainedTokenizer -from cnlp_args import DaptArguments -from cnlp_processors import AutoProcessor, classification, relex, tagging +from .cnlp_args import DaptArguments +from .cnlp_processors import AutoProcessor, classification, relex, tagging special_tokens = ["", "", "", "", "", "", "", ""] text_columns = ["text", "text_a", "text_b"] From a380cbd6353f5d735701bc0a93443d99f0e0e951 Mon Sep 17 00:00:00 2001 From: spencerthomas1722 Date: Wed, 30 Apr 2025 12:29:26 -0400 Subject: [PATCH 4/5] enable freeze for dapt --- src/cnlpt/dapt.py | 8 ++++++-- src/cnlpt/train_system.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cnlpt/dapt.py b/src/cnlpt/dapt.py index 49f75ea6..09c9afaa 100644 --- a/src/cnlpt/dapt.py +++ b/src/cnlpt/dapt.py @@ -21,7 +21,7 @@ from transformers.modeling_outputs import MaskedLMOutput from transformers.modeling_utils import PreTrainedModel -from .CnlpModelForClassification import CnlpConfig, generalize_encoder_forward_kwargs +from .CnlpModelForClassification import CnlpConfig, freeze_encoder_weights, generalize_encoder_forward_kwargs from .cnlp_args import DaptArguments, CnlpTrainingArguments from .cnlp_data import DaptDataset @@ -35,6 +35,7 @@ class DaptModel(PreTrainedModel): def __init__( self, config: config_class, + freeze: float = -1.0, ): super().__init__(config) encoder_config = AutoConfig.from_pretrained(config._name_or_path) @@ -45,6 +46,9 @@ def __init__( # if not config.character_level: self.encoder.resize_token_embeddings(encoder_config.vocab_size) + if freeze > 0: + freeze_encoder_weights(self.encoder.bert.encoder, freeze) + def forward( self, input_ids, @@ -158,7 +162,7 @@ def main( # model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name) config = AutoConfig.from_pretrained(dapt_args.encoder_name) - model = DaptModel(config) + model = DaptModel(config, freeze=training_args.freeze) dataset = DaptDataset(dapt_args, tokenizer=tokenizer) diff --git a/src/cnlpt/train_system.py b/src/cnlpt/train_system.py index 7caa2f1d..d75489dd 100644 --- a/src/cnlpt/train_system.py +++ b/src/cnlpt/train_system.py @@ -514,6 +514,7 @@ def main( # class_weights=dataset.class_weights, class_weights=class_weights, final_task_weight=training_args.final_task_weight, + freeze=training_args.freeze, ) if model_args.ignore_existing_classifiers: model.remove_task_classifiers() From e4ffc917f1f1e455d32c19a40846ae0c95d3ff65 Mon Sep 17 00:00:00 2001 From: Spencer Thomas Date: Thu, 12 Jun 2025 14:42:06 -0400 Subject: [PATCH 5/5] remove unnecessary import + comments --- src/cnlpt/cnlp_data.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/cnlpt/cnlp_data.py b/src/cnlpt/cnlp_data.py index 0124a6c2..12960788 100644 --- a/src/cnlpt/cnlp_data.py +++ b/src/cnlpt/cnlp_data.py @@ -7,8 +7,6 @@ from enum import Enum from typing import Union -import pdb - import datasets import numpy as np from datasets import Dataset as HFDataset @@ -1159,12 +1157,7 @@ def __init__( ) dataset = dataset.remove_columns("word_ids") - # dataset = dataset.map( - # functools.partial(group_texts, self.args.chunk_size), - # batched=True, - # ) - - + if isinstance(dataset, (DatasetDict, IterableDatasetDict)) or args.no_eval: self.dataset = dataset
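
Notes (illustrative sketches, not part of the patches themselves):

The class-weight change in PATCH 1/5 (train_system.py) computes inverse-frequency
weights per label, now splits tagging labels per token, and guards against
zero-count labels with max(label_counts[label], 1). A minimal sketch of that
computation, using a made-up toy label list in place of
dataset.processed_dataset["train"][task] and dataset.tasks_to_labels[task]:

    from collections import Counter

    import torch

    labels = ["O", "O", "O", "O", "O", "B-ent", "I-ent", "O"]  # toy token-level labels
    label_order = ["O", "B-ent", "I-ent"]  # stands in for dataset.tasks_to_labels[task]
    num_labels = len(label_order)

    weights = []
    label_counts = Counter(labels)
    for label in label_order:
        count = max(label_counts[label], 1)  # same zero-count guard as the patch
        weights.append(len(labels) / (num_labels * count))

    class_weights = torch.tensor(weights)
    # frequent "O" gets a small weight, rare entity tags get larger ones:
    # tensor([0.4444, 2.6667, 2.6667])

The new DaptModel in PATCH 1/5 (dapt.py) computes the masked-LM loss by flattening
logits and labels before CrossEntropyLoss. A shape-level sketch of that step with
random tensors (the vocab size and masking pattern below are arbitrary; in cnlpt the
labels come from DataCollatorForLanguageModeling, which marks unmasked positions
with -100, the index CrossEntropyLoss ignores by default):

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size, batch, seq_len = 30522, 2, 16
    logits = torch.randn(batch, seq_len, vocab_size)             # MLM head output
    mlm_labels = torch.randint(0, vocab_size, (batch, seq_len))
    mlm_labels[:, : seq_len // 2] = -100                         # "unmasked" positions, ignored by the loss

    loss = CrossEntropyLoss()(
        logits.view(-1, vocab_size),  # same reshape as DaptModel.forward
        mlm_labels.view(-1),
    )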