4 changes: 4 additions & 0 deletions benchmarks/nightly/autogen.yaml
@@ -155,3 +155,7 @@ vector_exp_bwd:
welford_fwd:
args: --op welford --baseline eager_layer_norm --metrics latency,speedup --only
test_no_welford,triton_welford,eager_layer_norm
bf16_flex_attention_fwd:
args: --op flex_attention --metrics latency,tflops --only compiled
bf16_flex_attention_bwd:
args: --op flex_attention --metrics latency,tflops --only compiled
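
For context, each entry's args value is a tritonbench command line (the identical entries added to manual.yaml below run the same way). A minimal sketch of invoking the new flex_attention benchmark by hand, assuming the nightly runner forwards the args string to tritonbench's run.py unchanged:

import shlex
import subprocess

# Hypothetical standalone run of the new nightly entry.
args = "--op flex_attention --metrics latency,tflops --only compiled"
subprocess.check_call(["python", "run.py", *shlex.split(args)])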
4 changes: 4 additions & 0 deletions benchmarks/nightly/manual.yaml
@@ -22,3 +22,7 @@ extra_args:
# flash_attention triton_tutorial_flash_v2 impl only supports causal in backward
bf16_flash_attention_bwd:
args: --op flash_attention --baseline flash_v3 --metrics latency,tflops,speedup --bwd --only triton_tutorial_flash_v2,flash_v3 --causal
bf16_flex_attention_fwd:
args: --op flex_attention --metrics latency,tflops --only compiled
bf16_flex_attention_bwd:
args: --op flex_attention --metrics latency,tflops --only compiled
10 changes: 0 additions & 10 deletions benchmarks/tagging/run.py
@@ -184,13 +184,6 @@ def trace_op(op):
return op_with_tags


UNSUPPORTED_OPS = [
"fp8_fused_quant_gemm_rowwise",
"fp32_to_mx4",
"flex_attention",
"mx4_to_fp32",
]

if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
@@ -201,9 +194,6 @@ def trace_op(op):
print(f"Running tagging test on ops: {ops}...")
results = {}
for op in ops:
# deadloop on flex_attention
if op in UNSUPPORTED_OPS:
continue
results.update(trace_op(op))
if not args.output:
print(results)
10 changes: 5 additions & 5 deletions benchmarks/tritonparse_sweep/run.py
@@ -41,10 +41,10 @@ def setup_tritonbench_cwd():
setup_tritonbench_cwd()

import tritonparse
from tritonparse.reproducer.orchestrator import reproduce as tritonparse_reproduce
from tritonparse.reproducer.types import KernelImportMode
from tritonbench.operators_collection import list_operators_by_collection
from tritonbench.utils.run_utils import run_in_task, setup_output_dir
from tritonparse.reproducer.orchestrator import reproduce as tritonparse_reproduce
from tritonparse.reproducer.types import KernelImportMode

NOT_WORKING_OPS = ["tritonparse_softmax_triton_softmax"]

@@ -93,12 +93,12 @@ def find_ndjson_files(log_dir):


def find_reproducer_script(output: str):
output_line: list[str] = [ x for x in output.splitlines() if "repro_script" in x ]
output_line: list[str] = [x for x in output.splitlines() if "repro_script" in x]
if len(output_line) == 0:
return None
output_line = output_line[0][output_line[0].find("{"):].strip()
output_line = output_line[0][output_line[0].find("{") :].strip()
output_dict = eval(output_line)
return output_dict['repro_script']
return output_dict["repro_script"]


def run_repro_script(repro_script):
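For context, find_reproducer_script() above scans task output for a log line that embeds a dict literal and eval()s it. A minimal sketch with a made-up log line (the exact log format is an assumption inferred from the parsing code; ast.literal_eval would be a safer drop-in for untrusted logs):

output = 'INFO repro_script ready: {"repro_script": "/tmp/repro_kernel.py"}'
line = [x for x in output.splitlines() if "repro_script" in x][0]
payload = line[line.find("{") :].strip()
# Mirrors the eval() call in find_reproducer_script().
assert eval(payload)["repro_script"] == "/tmp/repro_kernel.py"
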
45 changes: 35 additions & 10 deletions install.py
@@ -56,28 +56,51 @@ def install_jax(cuda_version=DEFAULT_CUDA_VERSION):
def install_fbgemm(genai=True):
cmd = ["pip", "install", "-r", "requirements.txt"]
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
# Build target A100(8.0) or H100(9.0, 9.0a)
# Build targets: H100 (9.0, 9.0a) and Blackwell (10.0, 12.0)
extra_envs = os.environ.copy()
if genai:
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=genai",
"-DTORCH_CUDA_ARCH_LIST=8.0;9.0;9.0a",
]
if not is_hip():
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=genai",
"-DTORCH_CUDA_ARCH_LIST=9.0;9.0a;10.0;12.0",
]
elif is_hip():
# build for MI300 (gfx942) and MI350 (gfx950)
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
cmd = [
"bash",
"-c",
f". .github/scripts/setup_env.bash; test_fbgemm_gpu_build_and_install {current_conda_env} genai/rocm",
]
extra_envs["BUILD_ROCM_VERSION"] = "7.0"
subprocess.check_call(
cmd, cwd=str(FBGEMM_PATH.parent.resolve()), env=extra_envs
)
return
else:
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=cuda",
"-DTORCH_CUDA_ARCH_LIST=8.0;9.0;9.0a",
"-DTORCH_CUDA_ARCH_LIST=9.0;9.0a;10.0;12.0",
]
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()), env=extra_envs)


def test_fbgemm():
print("Checking fbgemm_gpu installation...", end="")
# test triton
cmd = [
sys.executable,
"-c",
"import fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm",
]
subprocess.check_call(cmd)
# test genai (cutlass or ck)
cmd = [sys.executable, "-c", "import fbgemm_gpu.experimental.gen_ai"]
subprocess.check_call(cmd)
print("OK")
@@ -118,6 +141,8 @@ def setup_hip(args: argparse.Namespace):
# We have to disable all third-party deps that do not support hip/rocm
args.all = False
args.liger = True
args.aiter = True
args.fbgemm = True


if __name__ == "__main__":
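For reference, a hedged gloss of the new arch list used by both CUDA build targets above (the GPU family names are my annotation, not part of the PR):

# SM versions requested above:
#   9.0 / 9.0a -> Hopper (H100)
#   10.0       -> Blackwell data-center parts (e.g. B200)
#   12.0       -> Blackwell consumer/workstation parts
TORCH_CUDA_ARCH_LIST = "9.0;9.0a;10.0;12.0"
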
2 changes: 1 addition & 1 deletion submodules/aiter
Submodule aiter updated 3104 files
2 changes: 1 addition & 1 deletion tools/aiter/install.py
@@ -16,5 +16,5 @@ def pip_install_requirements():

def install_aiter():
pip_install_requirements()
cmd = ["python", "setup.py", "develop"]
cmd = ["pip", "install", "-e", "."]
subprocess.check_call(cmd, cwd=AITER_PATH)
4 changes: 2 additions & 2 deletions tritonbench/operators/fp8_gemm_rowwise/operator.py
@@ -100,7 +100,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:
HAS_CUTLASS_OR_CK = is_hip() or (
is_cuda() and get_nvidia_gpu_model() != "NVIDIA B200"
)
except (ImportError, AttributeError, FileNotFoundError):
except (ImportError, AttributeError, FileNotFoundError, OSError):
HAS_CUTLASS_OR_CK = False

try:
@@ -111,7 +111,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:

# TODO: remove these b200 hacks.
HAS_CUBLAS = is_cuda() and get_nvidia_gpu_model() != "NVIDIA B200"
except (ImportError, IOError, AttributeError, FileNotFoundError):
except (ImportError, IOError, AttributeError, FileNotFoundError, OSError):
HAS_CUBLAS = False


2 changes: 1 addition & 1 deletion tritonbench/operators/fp8_gemm_rowwise_grouped/operator.py
@@ -180,7 +180,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:
cutlass_or_ck_fp8_grouped_mm = torch.ops.fbgemm.f8f8bf16_rowwise_grouped_stacked
# Set HAS_CUTLASS_OR_CK to True if import succeeds
HAS_CUTLASS_OR_CK = True
except (ImportError, AttributeError):
except (ImportError, AttributeError, OSError):
# Set HAS_CUTLASS_OR_CK to False if import fails
HAS_CUTLASS_OR_CK = False

2 changes: 1 addition & 1 deletion tritonbench/utils/python_utils.py
@@ -9,5 +9,5 @@ def try_import(cond_name: str):
try:
yield
_caller_globals[cond_name] = True
except (ImportError, ModuleNotFoundError) as e:
except (ImportError, ModuleNotFoundError, OSError) as e:
_caller_globals[cond_name] = False
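
For context, try_import is a generator-based context manager that records import success into the caller's globals; adding OSError here matches the operator changes above, since loading a compiled extension can fail at dlopen time with OSError rather than ImportError. A minimal usage sketch (the flag name and import are illustrative):

from tritonbench.utils.python_utils import try_import

# Sets HAS_FBGEMM_GENAI = True in this module's globals if the import
# succeeds, and False if it raises ImportError, ModuleNotFoundError, or OSError.
with try_import("HAS_FBGEMM_GENAI"):
    import fbgemm_gpu.experimental.gen_ai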