Commit 09a0d3f

Merge pull request huggingface#3 from kaixuanliu/ipex
add XPU and HPU support
2 parents f61b8bd + fc979a9 commit 09a0d3f

File tree

10 files changed: +303 additions, -52 deletions

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ tracing = "0.1"
 serde = { version = "1.0", features = ["serde_derive"] }
 serde_json = "1.0"
 thiserror = "1.0"
+rand = "0.8"


 [patch.crates-io]

Dockerfile-intel

Lines changed: 65 additions & 13 deletions
@@ -1,6 +1,6 @@
+ARG PLATFORM=cpu
 FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
 WORKDIR /usr/src
-
 ENV SCCACHE=0.5.4
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache

@@ -54,8 +54,7 @@ COPY proto proto

 RUN cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s

-FROM intel/intel-optimized-pytorch:2.3.0-pip-base as base
-
+FROM intel/intel-optimized-pytorch:2.4.0-pip-base AS cpu
 ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80

@@ -72,26 +71,79 @@ COPY backends backends
 COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
 COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
 COPY backends/python/server/requirements-intel.txt backends/python/server/requirements.txt
+
+RUN python -m pip install torch==2.4.0 torchvision torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
+
 RUN cd backends/python/server && \
     make install

-RUN python -m pip install torch==2.4.0 torchvision torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/test/cpu
-RUN python -m pip uninstall -y intel-extension-for-pytorch
-RUN git clone https://github.com/intel/intel-extension-for-pytorch.git &&\
-    cd intel-extension-for-pytorch &&\
-    git reset --hard 620a9bfd9db42813931a857e78fa3f5d298be200 &&\
-    git submodule sync &&\
-    git submodule update --init --recursive &&\
-    python setup.py install
+FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest AS hpu
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    cmake \
+    ninja-build \
+    python3-dev &&\
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /usr/src
+COPY backends backends
+COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
+COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
+COPY backends/python/server/requirements-hpu.txt backends/python/server/requirements.txt
+
+RUN cd backends/python/server && \
+    make install
+
+FROM intel/intel-extension-for-pytorch:2.1.40-xpu AS xpu
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
+WORKDIR /usr/src
+RUN pip install PyYAML
+RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl && pip install torch-2.1.0.post1+cxx11.abi-cp310-cp310-linux_x86_64.whl
+RUN pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b distributed origin/dev/distributed
+
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
+ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
+ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
+
+RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
+
+COPY backends backends
+COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
+COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
+COPY backends/python/server/requirements-intel.txt backends/python/server/requirements.txt
+RUN cd backends/python/server && \
+    make install

-FROM base as grpc
+FROM ${PLATFORM} AS grpc

 COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

 ENTRYPOINT ["text-embeddings-router"]
 CMD ["--json-output"]

-FROM base
+FROM ${PLATFORM}

 COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

backends/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ text-embeddings-backend-candle = { path = "candle", optional = true }
 text-embeddings-backend-ort = { path = "ort", optional = true }
 tokio = { workspace = true }
 tracing = { workspace = true }
+rand = { workspace = true }

 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]

backends/python/server/requirements-hpu.txt (new file)

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
+click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
+deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.13.4 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.2.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec[http]==2024.2.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.22.2 ; python_version >= "3.9" and python_version < "3.13"
+humanfriendly==10.0 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
+importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13"
+jinja2==3.1.3 ; python_version >= "3.9" and python_version < "3.13"
+loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
+markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.13"
+mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
+networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.12.0 ; python_version >= "3.9" and python_version < "3.13"
+optimum==1.20.0 ; python_version >= "3.9" and python_version < "3.13"
+packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
+pandas==2.2.2 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.3.0 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.24.3 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.4.16 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==68.2.0 ; python_version >= "3.9" and python_version < "3.13"
+six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
+sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers[sentencepiece]==4.40.2 ; python_version >= "3.9" and python_version < "3.13"
+typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
+tzdata==2024.1 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
+win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+xxhash==3.4.1 ; python_version >= "3.9" and python_version < "3.13"
+yarl==1.9.4 ; python_version >= "3.9" and python_version < "3.13"
+zipp==3.18.1 ; python_version >= "3.9" and python_version < "3.13"
+pyrsistent==0.20.0 ; python_version >= "3.9" and python_version < "3.13"

backends/python/server/requirements-intel.txt

Lines changed: 2 additions & 1 deletion
@@ -40,4 +40,5 @@ typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.0 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.40.0 ; python_version >= "3.9" and python_version < "3.13"
+pyrsistent==0.20.0 ; python_version >= "3.9" and python_version < "3.13"

backends/python/server/text_embeddings_server/models/__init__.py

Lines changed: 18 additions & 19 deletions
@@ -1,3 +1,4 @@
+from pyrsistent import s
 import torch

 from loguru import logger
@@ -25,38 +26,36 @@
     __all__.append(FlashBert)


-def get_model(model_path: Path, dtype: Optional[str]):
+def get_model(model_path: Path, dtype: Optional[str]) :
     if dtype == "float32":
-        dtype = torch.float32
+        datatype = torch.float32
     elif dtype == "float16":
-        dtype = torch.float16
+        datatype = torch.float16
     elif dtype == "bfloat16":
-        dtype = torch.bfloat16
+        datatype = torch.bfloat16
     else:
         raise RuntimeError(f"Unknown dtype {dtype}")

     device = get_device()
     config = AutoConfig.from_pretrained(model_path)
-
     if config.model_type == "bert":
         config: BertConfig
         if (
             device.type == "cuda"
             and config.position_embedding_type == "absolute"
-            and dtype in [torch.float16, torch.bfloat16]
+            and datatype in [torch.float16, torch.bfloat16]
             and FLASH_ATTENTION
         ):
-            return FlashBert(model_path, device, dtype)
-        elif (
-            device.type == "cpu"
-            and use_ipex()
-        ):
-            logger.info("Use the flashBert for CPU")
-            return FlashBert(model_path, device, dtype)
-        else:
-            return DefaultModel(model_path, device, dtype)
+            return FlashBert(model_path, device, datatype)  # type: ignore
+        if use_ipex() and device.type in ["cpu", "xpu"]:
+            return FlashBert(model_path, device, datatype)  # type: ignore
+        if device.type == "hpu":
+            from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+            from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+            adapt_transformers_to_gaudi()
+            model_handle = DefaultModel(model_path, device, datatype)
+            model_handle.model = wrap_in_hpu_graph(model_handle.model, disable_tensor_cache=True)
+            return model_handle
+        return DefaultModel(model_path, device, datatype)
     else:
-        try:
-            return DefaultModel(model_path, device, dtype)
-        except:
-            raise RuntimeError(f"Unsupported model_type {config.model_type}")
+        return DefaultModel(model_path, device, datatype)
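
A rough usage sketch of the dispatch above (the checkpoint path and dtype are illustrative, not taken from the PR): for a BERT checkpoint, get_model now returns FlashBert on CUDA with flash attention or on CPU/XPU when IPEX is usable, returns a DefaultModel whose underlying module is wrapped in an HPU graph on Gaudi, and falls back to a plain DefaultModel otherwise.

    # Hypothetical caller; assumes the server package and a local BERT checkpoint are available.
    from pathlib import Path
    from text_embeddings_server.models import get_model

    model = get_model(Path("/data/models/bert-base-uncased"), dtype="bfloat16")
    # On HPU this is a DefaultModel whose .model attribute was wrapped by wrap_in_hpu_graph;
    # on CPU/XPU with IPEX (or CUDA with flash attention) it is a FlashBert instance.
    print(type(model).__name__)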

backends/python/server/text_embeddings_server/models/flash_bert.py

Lines changed: 4 additions & 2 deletions
@@ -12,7 +12,7 @@
 from text_embeddings_server.models import Model
 from text_embeddings_server.models.types import FlashBatch, Embedding
 from text_embeddings_server.utils.flash_attn import attention
-
+from text_embeddings_server.utils.device import use_ipex
 tracer = trace.get_tracer(__name__)


@@ -25,6 +25,8 @@ def __init__(self, prefix, handle, device, dtype, config: BertConfig):

     def forward(self, hidden_states, residual=None):
         # Flash attention imports
+        normed_hidden_states = None
+        res = None
         if self.device.type == "cuda":
             import dropout_layer_norm
             normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
@@ -46,7 +48,7 @@ def forward(self, hidden_states, residual=None):
             )
             if res is None:
                 res = hidden_states
-        else:
+        elif use_ipex():
             import intel_extension_for_pytorch as ipex
             normed_hidden_states = ipex.llm.functional.add_layer_norm(
                 residual,
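
For intuition, a plain-PyTorch reference of what both fused paths above are expected to compute; this equivalence is an assumption based on the kernel names (dropout_add_ln_fwd with zero dropout, and IPEX add_layer_norm), not something stated in the PR.

    import torch

    def add_layer_norm_reference(residual, hidden_states, weight, bias, eps=1e-12):
        # residual may be None on the first call, matching forward(hidden_states, residual=None)
        res = hidden_states if residual is None else hidden_states + residual
        normed = torch.nn.functional.layer_norm(res, (res.shape[-1],), weight, bias, eps)
        return normed, res
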
backends/python/server/text_embeddings_server/utils/device.py

Lines changed: 24 additions & 16 deletions

@@ -1,20 +1,21 @@
 import os
-from loguru import logger
+from loguru import logger  # type: ignore
 import importlib
 from packaging import version
 import torch
+import subprocess

-def is_ipex_available():
+def _is_ipex_available():
     def get_major_and_minor_from_version(full_version):
         return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)

-    _torch_version = importlib.metadata.version("torch")
-    if importlib.util.find_spec("intel_extension_for_pytorch") is None:
+    _torch_version = importlib.metadata.version("torch")  # type: ignore
+    if importlib.util.find_spec("intel_extension_for_pytorch") is None:  # type: ignore
         return False
     _ipex_version = "N/A"
     try:
-        _ipex_version = importlib.metadata.version("intel_extension_for_pytorch")
-    except importlib.metadata.PackageNotFoundError:
+        _ipex_version = importlib.metadata.version("intel_extension_for_pytorch")  # type: ignore
+    except importlib.metadata.PackageNotFoundError:  # type: ignore
         return False
     torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
     ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
@@ -26,22 +27,29 @@ def get_major_and_minor_from_version(full_version):
         return False
     return True

-def use_ipex() :
+def _is_hpu() -> bool:
+    is_hpu_available = True
+    try:
+        subprocess.run(["hl-smi"], capture_output=True, check=True)
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+        is_hpu_available = False
+    return is_hpu_available
+
+def use_ipex() -> bool:
     value = os.environ.get("USE_IPEX", "True").lower()
-    if value in ["true", "1"] and is_ipex_available():
-        return True
-    else:
-        return False
+    return (value in ["true", "1"] and _is_ipex_available())

 def get_device() :
+    device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda")
-    elif is_ipex_available():
+    elif _is_hpu():
+        import habana_frameworks.torch.core as htcore
+        if hasattr(torch, "hpu") and torch.hpu.is_available():  # type: ignore
+            device = torch.device("hpu")
+    elif use_ipex():
        if hasattr(torch, "xpu") and torch.xpu.is_available():
             device = torch.device("xpu")
-        else:
-            device = torch.device("cpu")
-    else:
-        device = torch.device("cpu")
+
     return device
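
A minimal sketch of how the backend picks a device after this change (the import path assumes the server package is installed; values are illustrative): CUDA wins first, then HPU when hl-smi succeeds and torch.hpu is usable, then XPU via IPEX, and plain CPU otherwise, with the IPEX path toggled by the USE_IPEX environment variable.

    import os
    from text_embeddings_server.utils.device import get_device, use_ipex

    os.environ["USE_IPEX"] = "1"   # on by default; set to "0" to force the non-IPEX path
    device = get_device()          # precedence: cuda > hpu > xpu > cpu
    print(device.type, "ipex enabled:", use_ipex())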

backends/python/server/text_embeddings_server/utils/flash_attn.py

Lines changed: 4 additions & 1 deletion
@@ -58,7 +58,10 @@ def attention(q, k, v, out, cu_seqlens, max_s, softmax_scale, is_causal=False):
     if HAS_FLASH_ATTN_V2:
         if use_ipex():
             import intel_extension_for_pytorch as ipex
-            return ipex.llm.functional.varlen_attention(q, k, v, out, cu_seqlens, cu_seqlens, max_s, max_s, 0, softmax_scale, zero_tensors=False, is_causal=False, return_softmax=False, gen_=None)
+            return ipex.llm.functional.varlen_attention(q, k, v, out, cu_seqlens, cu_seqlens,
+                                                        max_s, max_s, 0, softmax_scale,
+                                                        zero_tensors=False, is_causal=False,
+                                                        return_softmax=False, gen_=None)
         else:
             return flash_attn_2_cuda.varlen_fwd(
                 q,
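
A sketch of the packed-batch inputs the attention() signature above expects; the tensor shapes and sizes here are assumptions based on common flash-attention conventions, not taken from the PR.

    import torch

    seq_lens = torch.tensor([5, 3, 7], dtype=torch.int32)      # three packed sequences
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seq_lens, 0, dtype=torch.int32), (1, 0))
    max_s = int(seq_lens.max())                                 # longest sequence in the batch
    total_tokens, num_heads, head_dim = int(seq_lens.sum()), 12, 64
    q = k = v = torch.randn(total_tokens, num_heads, head_dim)
    out = torch.empty_like(q)
    # attention(q, k, v, out, cu_seqlens, max_s, head_dim ** -0.5) then dispatches to
    # ipex.llm.functional.varlen_attention or flash_attn_2_cuda.varlen_fwd.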
