Skip to content

Commit 68a5236

Browse files
snadampal and malfet authored
[aarch64] cherry-pick Xbyak crash fix and linter error fixes from main (#1649)
* cleanup mkldnn patching (#1630)

  pytorch is moved to oneDNN v3.3.2 and some of the old patches are not applicable any more.

* Add `aarch64_linux` to the list of linted files

* Actually fix lint this time

* aarch64: patch mkl-dnn for xbyak crashes due to /sys not accessible

  There are platforms with /sys not mounted. Skip handling HW caps for such platforms.

  cherry-pick of: uxlfoundation/oneDNN#1773
  This fixes issue pytorch/pytorch#115482.

---------

Co-authored-by: Nikita Shulga <[email protected]>
1 parent b5527e4 commit 68a5236

6 files changed

+132
-45
lines changed

.lintrunner.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ merge_base_with = "origin/main"
22

33
[[linter]]
44
code = 'RUFF'
5-
include_patterns = ['test/smoke_test/*.py', 's3_management/*.py']
5+
include_patterns = ['test/smoke_test/*.py', 's3_management/*.py', 'aarch64_linux/*.py']
66
command = [
77
'python3',
88
'tools/linter/adapters/ruff_linter.py',

aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# encoding: UTF-8
33

44
import os
5-
import subprocess
5+
from subprocess import check_output
66
from pygit2 import Repository
77
from typing import List
88

@@ -11,18 +11,20 @@ def list_dir(path: str) -> List[str]:
1111
''''
1212
Helper for getting paths for Python
1313
'''
14-
return subprocess.check_output(["ls", "-1", path]).decode().split("\n")
14+
return check_output(["ls", "-1", path]).decode().split("\n")
1515

1616

1717
def build_ArmComputeLibrary(git_clone_flags: str = "") -> None:
1818
'''
1919
Using ArmComputeLibrary for aarch64 PyTorch
2020
'''
2121
print('Building Arm Compute Library')
22+
acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
23+
"arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"])
2224
os.system("cd / && mkdir /acl")
2325
os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}")
2426
os.system("cd ComputeLibrary; export acl_install_dir=/acl; "
25-
"scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native build_dir=$acl_install_dir/build; "
27+
f"scons Werror=1 -j8 {acl_build_flags} build_dir=$acl_install_dir/build; "
2628
"cp -r arm_compute $acl_install_dir; "
2729
"cp -r include $acl_install_dir; "
2830
"cp -r utils $acl_install_dir; "
@@ -86,13 +88,12 @@ def parse_arguments():
8688
if override_package_version is not None:
8789
version = override_package_version
8890
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
89-
else:
90-
if branch in ['nightly', 'master']:
91-
build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
92-
version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
93-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
94-
if branch.startswith("v1.") or branch.startswith("v2."):
95-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
91+
elif branch in ['nightly', 'master']:
92+
build_date = check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
93+
version = check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
94+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
95+
elif branch.startswith(("v1.", "v2.")):
96+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
9697

9798
if enable_mkldnn:
9899
build_ArmComputeLibrary(git_clone_flags)
@@ -105,9 +106,10 @@ def parse_arguments():
105106
else:
106107
print("build pytorch without mkldnn backend")
107108

108-
# work around to fix Raspberry pie crash
109-
print("Applying mkl-dnn patch to fix readdir crash")
110-
os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-readdir-crash.patch")
109+
# patch mkldnn to fix aarch64 mac and aws lambda crash
110+
print("Applying mkl-dnn patch to fix crash due to /sys not accesible")
111+
os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/fix-xbyak-failure.patch")
112+
111113
os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
112114
pytorch_wheel_name = complete_wheel("pytorch")
113115
print(f"Build Compelete. Created {pytorch_wheel_name}..")

aarch64_linux/build_aarch64_wheel.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22

33
# This script is for building AARCH64 wheels using AWS EC2 instances.
44
# To generate binaries for the release follow these steps:
5-
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: "v1.11.0": ("0.11.0", "rc1"),
6-
# 2. Run script with following arguments for each of the supported python versions and specify required RC tag for example: v1.11.0-rc3:
7-
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>
5+
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
6+
# "v1.11.0": ("0.11.0", "rc1"),
7+
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
8+
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
89

910

1011
import boto3
@@ -177,7 +178,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
177178
try:
178179
with socket.create_connection((addr, port), timeout=timeout):
179180
return
180-
except (ConnectionRefusedError, socket.timeout):
181+
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
181182
if i == attempt_cnt - 1:
182183
raise
183184
time.sleep(timeout)
@@ -203,7 +204,7 @@ def install_condaforge(host: RemoteHost,
203204
if host.using_docker():
204205
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
205206
else:
206-
host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc'])
207+
host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc']) # noqa: E501
207208

208209

209210
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
@@ -221,12 +222,13 @@ def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
221222
print('Building OpenBLAS')
222223
host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.25 {git_clone_flags}")
223224
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
224-
host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS")
225+
host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") # noqa: E501
225226

226227

227228
def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
228229
print('Building Arm Compute Library')
229-
acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native"
230+
acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
231+
"arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"])
230232
host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}")
231233
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
232234

@@ -301,7 +303,7 @@ def build_torchvision(host: RemoteHost, *,
301303
# Remove .so files to force static linking
302304
host.run_cmd("rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so")
303305
# And patch setup.py to include libz dependency for libpng
304-
host.run_cmd(['sed -i -e \'s/image_link_flags\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'])
306+
host.run_cmd(['sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) # noqa: E501
305307

306308
build_vars = ""
307309
if branch == "nightly":
@@ -525,7 +527,7 @@ def start_build(host: RemoteHost, *,
525527
if host.using_docker():
526528
print("Move libgfortant.a into a standard location")
527529
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
528-
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'
530+
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501
529531
# Workaround by copying gfortran library from the host
530532
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
531533
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
@@ -543,22 +545,23 @@ def start_build(host: RemoteHost, *,
543545
# Breakpad build fails on aarch64
544546
build_vars = "USE_BREAKPAD=0 "
545547
if branch == 'nightly':
546-
build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "")
548+
build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "") # noqa: E501
547549
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
548550
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
549-
if branch.startswith("v1.") or branch.startswith("v2."):
551+
if branch.startswith(("v1.", "v2.")):
550552
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
551553
if host.using_docker():
552554
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
553555
if enable_mkldnn:
554556
build_ArmComputeLibrary(host, git_clone_flags)
555557
print("build pytorch with mkldnn+acl backend")
556558
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
557-
host.run_cmd(f"cd $HOME && git clone https://github.com/pytorch/builder.git")
558-
host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}")
559+
host.run_cmd("cd $HOME && git clone https://github.com/pytorch/builder.git && cd builder && git checkout release/2.2") # noqa: E501
560+
host.run_cmd("cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/fix-xbyak-failure.patch") # noqa: E501
561+
host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") # noqa: E501
559562
print('Repair the wheel')
560563
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
561-
host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}")
564+
host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}") # noqa: E501
562565
print('replace the original wheel with the repaired one')
563566
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
564567
host.run_cmd(f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}")
@@ -706,7 +709,7 @@ def parse_arguments():
706709
parser.add_argument("--build-only", action="store_true")
707710
parser.add_argument("--test-only", type=str)
708711
parser.add_argument("--os", type=str, choices=list(os_amis.keys()), default='ubuntu20_04')
709-
parser.add_argument("--python-version", type=str, choices=['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'], default=None)
712+
parser.add_argument("--python-version", type=str, choices=[f'3.{d}' for d in range(6, 12)], default=None)
710713
parser.add_argument("--alloc-instance", action="store_true")
711714
parser.add_argument("--list-instances", action="store_true")
712715
parser.add_argument("--pytorch-only", action="store_true")

aarch64_linux/embed_library.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414

1515
def replace_tag(filename):
16-
with open(filename, 'r') as f:
16+
with open(filename) as f:
1717
lines = f.read().split("\\n")
1818
for i,line in enumerate(lines):
1919
if not line.startswith("Tag: "):
@@ -42,7 +42,7 @@ def embed_library(whl_path, lib_soname, update_tag=False):
4242
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
4343
ctx.out_wheel=tmp_whl_name
4444
new_lib_path, new_lib_soname = None, None
45-
for filename, elf in elf_file_filter(ctx.iter_files()):
45+
for filename, _ in elf_file_filter(ctx.iter_files()):
4646
if not filename.startswith('torch/lib'):
4747
continue
4848
libtree = lddtree(filename)

mkldnn_fix/aarch64-fix-readdir-crash.patch

Lines changed: 0 additions & 14 deletions
This file was deleted.

mkldnn_fix/fix-xbyak-failure.patch

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
cpu: aarch64: fix xbyak functions for /sys access failures
2+
3+
There are platforms with /sys not mounted. skip handling HW caps
4+
for such platforms.
5+
6+
This fixes the issue# pytorch/pytorch#115482
7+
---
8+
.../xbyak_aarch64/src/util_impl_linux.h | 24 ++++++++++++++-----
9+
.../aarch64/xbyak_aarch64/src/util_impl_mac.h | 9 ++++---
10+
2 files changed, 24 insertions(+), 9 deletions(-)
11+
12+
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
13+
index 2c7b28e58b..860a05700f 100644
14+
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
15+
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
16+
@@ -144,8 +144,13 @@ private:
17+
regex_t regexBuf;
18+
regmatch_t match[1];
19+
20+
- if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0)
21+
- throw ERR_INTERNAL;
22+
+ if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0) {
23+
+ /* There are platforms with /sys not mounted. return empty buffers
24+
+ * in these scenarios
25+
+ */
26+
+ buf[0] = '\0';
27+
+ return 0;
28+
+ }
29+
30+
const int retVal = regexec(&regexBuf, path, 1, match, 0);
31+
regfree(&regexBuf);
32+
@@ -187,8 +192,12 @@ private:
33+
regex_t regexBuf;
34+
regmatch_t match[2];
35+
36+
- if (regcomp(&regexBuf, "index[0-9]*$", REG_EXTENDED) != 0)
37+
- throw ERR_INTERNAL;
38+
+ if (regcomp(&regexBuf, "index[0-9]*$", REG_EXTENDED) != 0) {
39+
+ /* There are platforms with /sys not mounted. return gracefully
40+
+ * in these scenarios
41+
+ */
42+
+ goto init_and_return_false;
43+
+ }
44+
45+
if (regexec(&regexBuf, dp->d_name, 1, match, 0) == 0) { // Found index[1-9][0-9]. directory
46+
char *dir_name = buf0;
47+
@@ -438,12 +447,15 @@ private:
48+
49+
FILE *file = fopen(path_midr_el1, "r");
50+
if (file == nullptr) {
51+
- throw Error(ERR_INTERNAL);
52+
+ /* There are platforms with /sys not mounted. return empty buffer
53+
+ * in these scenarios
54+
+ */
55+
+ cacheInfo_.midr_el1 = 0xFE << 24;
56+
return;
57+
}
58+
59+
if (fread(buf, sizeof(char), 64, file) == 0) {
60+
- throw Error(ERR_INTERNAL);
61+
+ cacheInfo_.midr_el1 = 0xFE << 24;
62+
return;
63+
}
64+
65+
diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
66+
index ebd6dba7c0..93bdae1d7a 100644
67+
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
68+
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
69+
@@ -102,18 +102,21 @@ private:
70+
size_t val = 0;
71+
size_t len = sizeof(val);
72+
73+
+ /* There are platforms with /sys not mounted. skip
74+
+ * handling HW caps for such platforms.
75+
+ */
76+
if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
77+
- throw Error(ERR_INTERNAL);
78+
+ type_ = 0;
79+
else
80+
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ATOMIC : 0;
81+
82+
if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
83+
- throw Error(ERR_INTERNAL);
84+
+ type_ = 0;
85+
else
86+
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_FP : 0;
87+
88+
if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
89+
- throw Error(ERR_INTERNAL);
90+
+ type_ = 0;
91+
else
92+
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ADVSIMD : 0;
93+
}
94+
--
95+
2.34.1
96+

0 commit comments

Comments
 (0)