blob: 44062a2f9eef48bc268a8ccb4e9c537cb429c235 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
|
# Copyright 1999-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# shellcheck disable=SC2317
EAPI=8
# ROCM_VERSION and PYTHON_COMPAT are assigned before `inherit`;
# presumably they are read by the rocm and python-r1 eclasses at inherit
# time -- confirm against the eclass documentation before reordering.
ROCM_VERSION=${PV}
PYTHON_COMPAT=( python3_{10..13} python3_13t )
inherit check-reqs cmake flag-o-matic multiprocessing python-r1 rocm
# googletest snapshot: downloaded only with USE=test (see SRC_URI) and
# referenced again in src_configure via GTEST_COMMIT.
GTEST_COMMIT="b85864c64758dec007208e56af933fc3f52044ee"
GTEST_FILE="gtest-1.14.0_p20220421.tar.gz"
DESCRIPTION="High Performance Composable Kernel for AMD GPUs"
HOMEPAGE="https://github.com/ROCm/composable_kernel"
# The upstream tag tarball is renamed to ${P}.tar.gz; the googletest
# archive is renamed to ${GTEST_FILE}.
SRC_URI="https://github.com/ROCm/composable_kernel/archive/rocm-${PV}.tar.gz -> ${P}.tar.gz
test? ( https://github.com/google/googletest/archive/${GTEST_COMMIT}.tar.gz -> ${GTEST_FILE} )"
# Matches the top-level directory name of the upstream tag tarball.
S="${WORKDIR}/composable_kernel-rocm-${PV}"
LICENSE="MIT"
# The subslot is the first two components of the package version.
SLOT="0/$(ver_cut 1-2)"
KEYWORDS="~amd64"
IUSE="debug profiler test"
REQUIRED_USE="${ROCM_REQUIRED_USE} ${PYTHON_REQUIRED_USE}"
# Tests are restricted unless USE=test is enabled.
RESTRICT="!test? ( test )"
# dev-util/hip is pinned to the same SLOT (including subslot) as this package.
RDEPEND="
dev-util/hip:${SLOT}
${PYTHON_DEPS}
"
DEPEND="${RDEPEND}"
BDEPEND="
dev-build/rocm-cmake
"
# Patches shipped in FILESDIR; the version embedded in each file name
# (6.1.1 / 6.3.0) is presumably the release it was first written for.
PATCHES=(
"${FILESDIR}"/${PN}-6.1.1-enable-examples.patch
"${FILESDIR}"/${PN}-6.1.1-no-git-no-hash.patch
"${FILESDIR}"/${PN}-6.3.0-no-inline-all.patch
"${FILESDIR}"/${PN}-6.3.0-conditional-kernels.patch
"${FILESDIR}"/${PN}-6.3.0-conditional-ckprofiler.patch
"${FILESDIR}"/${PN}-6.3.0-expand-isa.patch
)
# Warn about expensive build configurations and run the check-reqs memory
# check that belongs to the current phase.
#
# Globals read: MERGE_TYPE, AMDGPU_TARGETS, EBUILD_PHASE_FUNC.
# Does nothing for binary merges.  Both the advisory warning and
# CHECKREQS_MEMORY are based on an estimate of ~2 GiB of RAM per build job.
ck_check-reqs() {
	[[ ${MERGE_TYPE} == binary ]] && return

	# Deliberately unquoted: AMDGPU_TARGETS is a whitespace-separated list.
	# Declared local so the array does not leak into the global ebuild scope.
	local -a targets=( ${AMDGPU_TARGETS} )
	if (( ${#targets[@]} > 1 )); then
		ewarn "composable-kernel will be compiled for multiple GPU architectures,"
		ewarn "which will take a significant amount of time."
		ewarn "Please consider setting AMDGPU_TARGETS USE_EXPAND variable to a single architecture."
	fi

	# It takes ~2Gb of RAM per build thread
	local user_jobs free_memory_mb
	user_jobs=$(makeopts_jobs)
	# Pin the C locale so the "Mem:" row label matched by awk does not depend
	# on the user's message locale.
	free_memory_mb=$(LC_ALL=C free -m | awk '/Mem:/ {print $4}')

	# Only give advice when the amount of free memory could actually be
	# determined; otherwise the message would contain an empty number.
	if [[ ${free_memory_mb} =~ ^[0-9]+$ ]]; then
		local max_jobs=$(( free_memory_mb / 2048 ))
		(( max_jobs < 1 )) && max_jobs=1
		if (( max_jobs < user_jobs )); then
			ewarn "${free_memory_mb} MB of free RAM is not enough for ${user_jobs} parallel build jobs (~2Gb per job)."
			# max_jobs < user_jobs here, so max_jobs is the job count to suggest.
			ewarn "Please consider setting MAKEOPTS=\"-j${max_jobs}\" for this package."
		fi
	fi

	# Read by the check-reqs phase function through bash dynamic scoping.
	local CHECKREQS_MEMORY=$(( user_jobs * 2048 ))M
	"check-reqs_${EBUILD_PHASE_FUNC}"
}
# pkg_pretend phase: delegate to the shared requirements check.
pkg_pretend() {
	ck_check-reqs
}
# pkg_setup phase: delegate to the shared requirements check.
pkg_setup() {
	ck_check-reqs
}
# Delete every line mentioning -Werror from the warning setup of the build
# system, then run the regular cmake preparation step.
src_prepare() {
	sed -i -e '/-Werror/d' cmake/EnableCompilerWarnings.cmake \
		|| die

	cmake_src_prepare
}
# Configure the CMake build.
#
# rocm_use_hipcc runs first, CMAKE_BUILD_TYPE follows USE=debug, and the
# testing/profiler switches follow their USE flags.
src_configure() {
	rocm_use_hipcc

	if use debug; then
		CMAKE_BUILD_TYPE="Debug"
	else
		# Non-debug builds define NDEBUG for both C and C++ sources.
		append-cflags "-DNDEBUG"
		append-cxxflags "-DNDEBUG"
		CMAKE_BUILD_TYPE="Release"
	fi

	local mycmakeargs=(
		-DCMAKE_SKIP_RPATH=ON
		-DBUILD_DEV=OFF
		-DGPU_TARGETS="$(get_amdgpu_flags)"
		-DCMAKE_INSTALL_PREFIX="${EPREFIX}/usr"
		-DBUILD_TESTING=$(usex test ON OFF)
		-DCK_USE_PROFILER=$(usex profiler ON OFF)
		-Wno-dev
	)

	# With tests enabled, point FETCHCONTENT_SOURCE_DIR_GTEST at the
	# googletest tree under WORKDIR.
	use test && mycmakeargs+=(
		-DFETCHCONTENT_SOURCE_DIR_GTEST="${WORKDIR}/googletest-${GTEST_COMMIT}"
	)

	cmake_src_configure
}
# Install the CMake build, then install the ck4inductor Python module and
# the headers it ships as package data for every enabled Python
# implementation.
src_install() {
	cmake_src_install

	# Per-implementation installer, invoked through python_foreach_impl.
	# Header paths are relative to the current directory.
	installation() {
		python_domodule python/ck4inductor

		# install package-data manually, as there is no PEP517 compliance
		local -a package_data
		# globstar is needed for the recursive ** patterns below.
		shopt -s globstar
		package_data=(
			include/ck/**/*.hpp
			library/src/tensor_operation_instance/gpu/gemm_universal/**/*.hpp
		)
		shopt -u globstar

		local inst_path file location
		inst_path="${D}$(python_get_sitedir)/ck4inductor"
		for file in "${package_data[@]}"; do
			# Mirror the source-relative directory below the module directory.
			location="${inst_path}/${file%/*}"
			# Abort instead of silently producing an incomplete install; this
			# also catches a pattern that matched nothing (left as a literal).
			mkdir -p "${location}" || die "mkdir -p ${location} failed"
			cp "${file}" "${location}" || die "copying ${file} failed"
		done
	}
	python_foreach_impl installation
}
# Run the test suite with -j1 after check_amdgpu (presumably a GPU
# accessibility check -- confirm in rocm.eclass).
src_test() {
	check_amdgpu

	# Expose BUILD_DIR/lib to the test run via LD_LIBRARY_PATH; the variable
	# is scoped to this function and exported to child processes.
	local -x LD_LIBRARY_PATH="${BUILD_DIR}/lib"
	cmake_src_test -j1
}
|