Fix itrex qbits nf4/int8 training core dumped issue (#1954)

Kaihui-intel · web-flow · commit 190e6b2be6b3 · 2024-07-29T19:39:57.000+08:00
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
diff --git a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
@@ -72,6 +72,7 @@ FRAMEWORK="pytorch"
 source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh 'latest'
 if [[ "${inc_new_api}" == "3x"* ]]; then
     FRAMEWORK_VERSION="latest"
+    export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
 else
     FRAMEWORK_VERSION=${pytorch_version}
     TORCH_VISION_VERSION=${torchvision_version}
diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh b/.azure-pipelines/scripts/ut/3x/run_3x_pt.sh
@@ -5,6 +5,7 @@ echo "${test_case}"
 
 # install requirements
 echo "set up UT env..."
+export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
 pip install -r /neural-compressor/test/3x/torch/requirements.txt
 pip install pytest-cov
 pip install pytest-html
diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh
@@ -6,6 +6,7 @@ echo "run itrex ut..."
 
 # install inc 3x deps
 pip install -r /neural-compressor/requirements_pt.txt
+export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
 
 # prepare itrex
 git clone https://github.com/intel/intel-extension-for-transformers.git /intel-extension-for-transformers
diff --git a/.azure-pipelines/ut-itrex.yml b/.azure-pipelines/ut-itrex.yml
@@ -13,10 +13,6 @@ pr:
       - requirements.txt
       - .azure-pipelines/scripts/ut/run_itrex.sh
       - .azure-pipelines/ut-itrex.yml
-    exclude:
-      - neural_compressor/common
-      - neural_compressor/torch
-      - neural_compressor/tensorflow
 
 pool: MODEL_PERF_TEST
 
diff --git a/docs/source/faq.md b/docs/source/faq.md
@@ -17,3 +17,12 @@ ImportError: libGL.so.1: cannot open shared object file: No such file or directo
 #### Issue 4:  
 Conda package *neural-compressor-full* (this binary is only available from v1.13 to v2.1.1) dependency conflict may pending on conda installation for a long time.   
 **Solution:** run *conda install sqlalchemy=1.4.27 alembic=1.7.7 -c conda-forge* before install *neural-compressor-full*. 
+#### Issue 5: 
+If you run the 3X torch extension API inside a Docker container, you may encounter the following error:  
+```shell
+ValueError: No threading layer could be loaded.
+HINT:
+Intel TBB is required, try:
+$ conda/pip install tbb
+```
+**Solution:** TBB is already installed by `requirements_pt.txt`; you only need to make the library discoverable by running `export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH`.
diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py
@@ -607,6 +607,7 @@ def pack_array_with_numba(
 
         pack_method_name = f"pack_array_with_numba_b{bits}_c{compress_bits}"
         pack_method = getattr(self, pack_method_name)
+        numba.config.THREADING_LAYER = "safe"
         return pack_method(raw_array, packed_array, n_pack, new_in_features)
 
     def pack_tensor_with_numpy_impl(self, raw_tensor):
diff --git a/requirements_pt.txt b/requirements_pt.txt
@@ -5,3 +5,4 @@ prettytable
 psutil
 py-cpuinfo
 pydantic
+tbb