Merge pull request #217 from KhiopsML/216-coreapi-does-not-detect-unknown-parameters

folmos-at-orange · web-flow · commit 5f3137534112 · 2024-07-31T18:10:22.000+02:00
Fix unknown arguments being silently ignored
diff --git a/khiops/core/api.py b/khiops/core/api.py
@@ -299,6 +299,11 @@ def _preprocess_task_arguments(task_args):
                 )
             )
 
+    # Flatten kwargs
+    if "kwargs" in task_args:
+        task_args.update(task_args["kwargs"])
+        del task_args["kwargs"]
+
     return task_called_with_domain
 
 
@@ -336,10 +341,10 @@ def _preprocess_format_spec(detect_format, header_line, field_separator):
 def _clean_task_args(task_args):
     """Cleans the task arguments
 
-    More precisely:
-        - It removes command line arguments (they already are in another object).
-        - It removes parameters removed from the API and warns about it.
-        - It removes renamed API parameters and warns about it.
+    More precisely it removes:
+        - Command line arguments (they already are in another object).
+        - Parameters removed from the API (with a warning).
+        - Renamed API parameters (with a warning).
     """
     # Remove non-task parameters
     command_line_arg_names = [
@@ -353,7 +358,6 @@ def _clean_task_args(task_args):
         "trace",
         "stdout_file_path",
         "stderr_file_path",
-        "kwargs",
     ]
     for arg_name in command_line_arg_names + other_arg_names:
         if arg_name in task_args:
diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
@@ -1464,9 +1464,11 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir):
         # Build the optional parameters from a copy of the estimator parameters
         kwargs = self.get_params()
 
-        # Remove 'key' and 'output_dir'
+        # Remove non core.api params
         del kwargs["key"]
         del kwargs["output_dir"]
+        del kwargs["auto_sort"]
+        del kwargs["internal_sort"]
 
         # Set the sampling percentage to a 100%
         kwargs["sample_percentage"] = 100
diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py
@@ -166,11 +166,27 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
                 y,
                 categorical_target=categorical_target,
             )
-        # A sparse matrix
+        # A scipy.sparse.spmatrix
         elif isinstance(X, sp.spmatrix):
             self._init_tables_from_sparse_matrix(
                 X, y, categorical_target=categorical_target
             )
+        # Special rejection for scipy.sparse.sparray (to pass the sklearn tests)
+        # Note: We list the concrete classes instead of scipy.sparse.sparray because
+        # it is not available in scipy 1.10, the latest version supporting py3.8
+        elif isinstance(
+            X,
+            (
+                sp.bsr_array,
+                sp.coo_array,
+                sp.csc_array,
+                sp.csr_array,
+                sp.dia_array,
+                sp.dok_array,
+                sp.lil_array,
+            ),
+        ):
+            check_array(X, accept_sparse=False)
         # A tuple spec
         elif isinstance(X, tuple):
             warnings.warn(
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -102,11 +102,11 @@ def test_analysis_results(self):
                     with self.assertWarns(UserWarning):
                         results = kh.read_analysis_results_file(ref_json_report)
                         results.write_report_file(output_report)
-                        files_equal_or_fail(ref_report, output_report)
+                        assert_files_equal(self, ref_report, output_report)
                 else:
                     results = kh.read_analysis_results_file(ref_json_report)
                     results.write_report_file(output_report)
-                    files_equal_or_fail(ref_report, output_report)
+                    assert_files_equal(self, ref_report, output_report)
 
     def test_coclustering_results(self):
         """Tests for the coclustering_results module"""
@@ -149,7 +149,7 @@ def test_coclustering_results(self):
                 else:
                     results = kh.read_coclustering_results_file(ref_json_report)
                 results.write_report_file(output_report)
-                files_equal_or_fail(ref_report, output_report)
+                assert_files_equal(self, ref_report, output_report)
                 for dimension in results.coclustering_report.dimensions:
                     ref_hierarchy_report = os.path.join(
                         ref_reports_dir, f"{report}_hierarchy_{dimension.name}.txt"
@@ -160,7 +160,9 @@ def test_coclustering_results(self):
                     dimension.write_hierarchy_structure_report_file(
                         output_hierarchy_report
                     )
-                    files_equal_or_fail(ref_hierarchy_report, output_hierarchy_report)
+                    assert_files_equal(
+                        self, ref_hierarchy_report, output_hierarchy_report
+                    )
 
     def test_binary_dictionary_domain(self):
         """Test binary dictionary write"""
@@ -204,13 +206,13 @@ def test_binary_dictionary_domain(self):
         for domain in (domain_from_api, domain_from_json):
             # Dump domain object as kdic file and compare it to the reference
             domain.export_khiops_dictionary_file(output_kdic)
-            files_equal_or_fail(ref_kdic, output_kdic)
+            assert_files_equal(self, ref_kdic, output_kdic)
 
             # Make a copy of the domain object, then dump it as kdic file and
             # compare it to the reference
             domain_copy = domain.copy()
             domain_copy.export_khiops_dictionary_file(copy_output_kdic)
-            files_equal_or_fail(ref_kdic, copy_output_kdic)
+            assert_files_equal(self, ref_kdic, copy_output_kdic)
 
     def test_dictionary(self):
         """Tests for the dictionary module"""
@@ -276,19 +278,13 @@ def test_dictionary(self):
                 else:
                     domain = kh.read_dictionary_file(ref_kdicj)
                 domain.export_khiops_dictionary_file(output_kdic)
-                files_equal_or_fail(ref_kdic, output_kdic)
+                assert_files_equal(self, ref_kdic, output_kdic)
 
                 domain_copy = domain.copy()
                 domain_copy.export_khiops_dictionary_file(copy_output_kdic)
-                files_equal_or_fail(ref_kdic, copy_output_kdic)
-
-    def test_api_scenario_generation(self):
-        """Tests the scenarios generated by the API
+                assert_files_equal(self, ref_kdic, copy_output_kdic)
 
-        These tests are not exhaustive, executed with the minimal parameters to trigger
-        the more complex scenario generation code (lists, key-value sections) when they
-        are present.
-        """
+    def _build_mock_api_method_parameters(self):
         # Pseudo-mock data to test the creation of scenarios
         datasets = ["Adult", "SpliceJunction", "Customer"]
         additional_data_tables = {
@@ -571,6 +567,15 @@ def test_api_scenario_generation(self):
             },
         }
 
+        return method_test_args
+
+    def test_api_scenario_generation(self):
+        """Tests the scenarios generated by the API
+
+        These tests are not exhaustive, executed with the minimal parameters to trigger
+        the more complex scenario generation code (lists, key-value sections) when they
+        are present.
+        """
         # Set the root directory of these tests
         test_resources_dir = os.path.join(resources_dir(), "scenario_generation", "api")
 
@@ -580,44 +585,69 @@ def test_api_scenario_generation(self):
         kh.set_runner(test_runner)
 
         # Run test for all methods and all mock datasets parameters
+        method_test_args = self._build_mock_api_method_parameters()
         for method_name, method_full_args in method_test_args.items():
-            self._test_method_scenario_generation(
-                test_runner,
-                method_name,
-                method_full_args,
-            )
+            # Set the runner's test name
+            test_runner.test_name = method_name
+
+            # Clean the directory for this method's tests
+            cleanup_dir(test_runner.output_scenario_dir, "*/output/*._kh", verbose=True)
+
+            # Test for each dataset mock parameters
+            for dataset, dataset_method_args in method_full_args.items():
+                test_runner.subtest_name = dataset
+                with self.subTest(dataset=dataset, method=method_name):
+                    # Execute the method
+                    method = getattr(kh, method_name)
+                    dataset_args = dataset_method_args["args"]
+                    dataset_kwargs = dataset_method_args["kwargs"]
+                    method(*dataset_args, **dataset_kwargs)
+
+                    # Compare the reference with the output
+                    assert_files_equal(
+                        self,
+                        test_runner.ref_scenario_path,
+                        test_runner.output_scenario_path,
+                        line_comparator=scenario_line_comparator,
+                    )
 
         # Restore the default runner
         kh.set_runner(default_runner)
 
-    def _test_method_scenario_generation(
-        self,
-        runner,
-        method_name,
-        method_full_args,
-    ):
-        # Set the runners test name
-        runner.test_name = method_name
-
-        # Clean the directory for this method's tests
-        cleanup_dir(runner.output_scenario_dir, "*/output/*._kh", verbose=True)
+    def test_unknown_argument_in_api_method(self):
+        """Tests if core.api raises ValueError when an unknown argument is passed"""
+        # Obtain mock arguments for each API call
+        method_test_args = self._build_mock_api_method_parameters()
 
         # Test for each dataset mock parameters
-        for dataset, dataset_method_args in method_full_args.items():
-            runner.subtest_name = dataset
-            with self.subTest(dataset=dataset, method=method_name):
-                # Execute the method
-                method = getattr(kh, method_name)
-                dataset_args = dataset_method_args["args"]
-                dataset_kwargs = dataset_method_args["kwargs"]
-                method(*dataset_args, **dataset_kwargs)
-
-                # Compare the reference with the output
-                files_equal_or_fail(
-                    runner.ref_scenario_path,
-                    runner.output_scenario_path,
-                    line_comparator=scenario_line_comparator,
-                )
+        for method_name, method_full_args in method_test_args.items():
+            for dataset, dataset_method_args in method_full_args.items():
+                # Test only for the Adult dataset
+                if dataset != "Adult":
+                    continue
+
+                with self.subTest(method=method_name):
+                    # These methods do not have kwargs so they cannot have extra args
+                    if method_name in [
+                        "detect_data_table_format",
+                        "export_dictionary_as_json",
+                    ]:
+                        continue
+
+                    # Execute the method with an invalid parameter
+                    method = getattr(kh, method_name)
+                    dataset_args = dataset_method_args["args"]
+                    dataset_kwargs = dataset_method_args["kwargs"]
+                    dataset_kwargs["INVALID_PARAM"] = False
+
+                    # Check that the call raised ValueError
+                    with self.assertRaises(ValueError) as context:
+                        method(*dataset_args, **dataset_kwargs)
+
+                    # Check the message
+                    expected_msg = "Unknown argument 'INVALID_PARAM'"
+                    output_msg = str(context.exception)
+                    self.assertEqual(output_msg, expected_msg)
 
     def test_general_options(self):
         """Test that the general options are written to the scenario file"""
@@ -642,7 +672,8 @@ def test_general_options(self):
         kh.check_database("a.kdic", "dict_name", "data.txt")
 
         # Compare the reference with the output
-        files_equal_or_fail(
+        assert_files_equal(
+            self,
             test_runner.ref_scenario_path,
             test_runner.output_scenario_path,
             line_comparator=scenario_line_comparator,
@@ -2075,8 +2106,8 @@ def find_first_different_byte(ref_line, output_line):
     return first_diff_pos, first_diff_ref_byte, first_diff_output_byte
 
 
-def files_equal_or_fail(
-    ref_file_path, output_file_path, line_comparator=default_line_comparator
+def assert_files_equal(
+    test_suite, ref_file_path, output_file_path, line_comparator=default_line_comparator
 ):
     """Portably tests if two files are equal by comparing line-by-line"""
     # Read all lines from the files
@@ -2089,7 +2120,7 @@ def files_equal_or_fail(
     ref_file_len = len(ref_file_lines)
     output_file_len = len(output_file_lines)
     if ref_file_len != output_file_len:
-        raise ValueError(
+        test_suite.fail(
             "Files have different number of lines\n"
             + f"Ref file           : {shorten_path(ref_file_path, 5)}\n"
             + f"Output file        : {shorten_path(output_file_path, 5)}\n"