4 changes: 4 additions & 0 deletions benchmarks/nightly/autogen.yaml
@@ -155,3 +155,7 @@ vector_exp_bwd:
welford_fwd:
args: --op welford --baseline eager_layer_norm --metrics latency,speedup --only
test_no_welford,triton_welford,eager_layer_norm
bf16_flex_attention_fwd:
args: --op flex_attention --metrics latency,tflops --only compiled
bf16_flex_attention_bwd:
args: --op flex_attention --metrics latency,tflops --only compiled
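
For context, each entry's args value is a tritonbench command line (the identical entries added to manual.yaml below run the same way). A minimal sketch of invoking the new flex_attention benchmark by hand, assuming the nightly runner forwards the args string to tritonbench's run.py unchanged:

import shlex
import subprocess

# Hypothetical standalone run of the new nightly entry.
args = "--op flex_attention --metrics latency,tflops --only compiled"
subprocess.check_call(["python", "run.py", *shlex.split(args)])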
4 changes: 4 additions & 0 deletions benchmarks/nightly/manual.yaml
@@ -22,3 +22,7 @@ extra_args:
# flash_attention triton_tutorial_flash_v2 impl only supports causal in backward
bf16_flash_attention_bwd:
args: --op flash_attention --baseline flash_v3 --metrics latency,tflops,speedup --bwd --only triton_tutorial_flash_v2,flash_v3 --causal
bf16_flex_attention_fwd:
args: --op flex_attention --metrics latency,tflops --only compiled
bf16_flex_attention_bwd:
args: --op flex_attention --metrics latency,tflops --only compiled
10 changes: 0 additions & 10 deletions benchmarks/tagging/run.py
@@ -184,13 +184,6 @@ def trace_op(op):
return op_with_tags


UNSUPPORTED_OPS = [
"fp8_fused_quant_gemm_rowwise",
"fp32_to_mx4",
"flex_attention",
"mx4_to_fp32",
]

if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
@@ -201,9 +194,6 @@ def trace_op(op):
print(f"Running tagging test on ops: {ops}...")
results = {}
for op in ops:
# deadloop on flex_attention
if op in UNSUPPORTED_OPS:
continue
results.update(trace_op(op))
if not args.output:
print(results)
10 changes: 5 additions & 5 deletions benchmarks/tritonparse_sweep/run.py
@@ -41,10 +41,10 @@ def setup_tritonbench_cwd():
setup_tritonbench_cwd()

import tritonparse
from tritonparse.reproducer.orchestrator import reproduce as tritonparse_reproduce
from tritonparse.reproducer.types import KernelImportMode
from tritonbench.operators_collection import list_operators_by_collection
from tritonbench.utils.run_utils import run_in_task, setup_output_dir
from tritonparse.reproducer.orchestrator import reproduce as tritonparse_reproduce
from tritonparse.reproducer.types import KernelImportMode

NOT_WORKING_OPS = ["tritonparse_softmax_triton_softmax"]

@@ -93,12 +93,12 @@ def find_ndjson_files(log_dir):


def find_reproducer_script(output: str):
output_line: list[str] = [ x for x in output.splitlines() if "repro_script" in x ]
output_line: list[str] = [x for x in output.splitlines() if "repro_script" in x]
if len(output_line) == 0:
return None
output_line = output_line[0][output_line[0].find("{"):].strip()
output_line = output_line[0][output_line[0].find("{") :].strip()
output_dict = eval(output_line)
return output_dict['repro_script']
return output_dict["repro_script"]


def run_repro_script(repro_script):
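For context, find_reproducer_script() above scans task output for a log line that embeds a dict literal and eval()s it. A minimal sketch with a made-up log line (the exact log format is an assumption inferred from the parsing code; ast.literal_eval would be a safer drop-in for untrusted logs):

output = 'INFO repro_script ready: {"repro_script": "/tmp/repro_kernel.py"}'
line = [x for x in output.splitlines() if "repro_script" in x][0]
payload = line[line.find("{") :].strip()
# Mirrors the eval() call in find_reproducer_script().
assert eval(payload)["repro_script"] == "/tmp/repro_kernel.py"
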
45 changes: 35 additions & 10 deletions install.py
@@ -56,28 +56,51 @@ def install_jax(cuda_version=DEFAULT_CUDA_VERSION):
def install_fbgemm(genai=True):
cmd = ["pip", "install", "-r", "requirements.txt"]
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
# Build target A100(8.0) or H100(9.0, 9.0a)
# Build targets: H100 (9.0, 9.0a) and Blackwell (10.0, 12.0)
extra_envs = os.environ.copy()
if genai:
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=genai",
"-DTORCH_CUDA_ARCH_LIST=8.0;9.0;9.0a",
]
if not is_hip():
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=genai",
"-DTORCH_CUDA_ARCH_LIST=9.0;9.0a;10.0;12.0",
]
elif is_hip():
# build for MI300 (gfx942) and MI350 (gfx950)
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
cmd = [
"bash",
"-c",
f". .github/scripts/setup_env.bash; test_fbgemm_gpu_build_and_install {current_conda_env} genai/rocm",
]
extra_envs["BUILD_ROCM_VERSION"] = "7.0"
subprocess.check_call(
cmd, cwd=str(FBGEMM_PATH.parent.resolve()), env=extra_envs
)
return
else:
cmd = [
sys.executable,
"setup.py",
"install",
"--build-target=cuda",
"-DTORCH_CUDA_ARCH_LIST=8.0;9.0;9.0a",
"-DTORCH_CUDA_ARCH_LIST=9.0;9.0a;10.0;12.0",
]
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()), env=extra_envs)


def test_fbgemm():
print("Checking fbgemm_gpu installation...", end="")
# test triton
cmd = [
sys.executable,
"-c",
"import fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm",
]
subprocess.check_call(cmd)
# test genai (cutlass or ck)
cmd = [sys.executable, "-c", "import fbgemm_gpu.experimental.gen_ai"]
subprocess.check_call(cmd)
print("OK")
@@ -118,6 +141,8 @@ def setup_hip(args: argparse.Namespace):
# We have to disable all third-party deps that do not support hip/rocm
args.all = False
args.liger = True
args.aiter = True
args.fbgemm = True


if __name__ == "__main__":
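For reference, a hedged gloss of the new arch list used by both CUDA build targets above (the GPU family names are my annotation, not part of the PR):

# SM versions requested above:
#   9.0 / 9.0a -> Hopper (H100)
#   10.0       -> Blackwell data-center parts (e.g. B200)
#   12.0       -> Blackwell consumer/workstation parts
TORCH_CUDA_ARCH_LIST = "9.0;9.0a;10.0;12.0"
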
2 changes: 1 addition & 1 deletion submodules/aiter
Submodule aiter updated 3104 files
2 changes: 1 addition & 1 deletion tools/aiter/install.py
@@ -16,5 +16,5 @@ def pip_install_requirements():

def install_aiter():
pip_install_requirements()
cmd = ["python", "setup.py", "develop"]
cmd = ["pip", "install", "-e", "."]
subprocess.check_call(cmd, cwd=AITER_PATH)
4 changes: 2 additions & 2 deletions tritonbench/operators/fp8_gemm_rowwise/operator.py
@@ -100,7 +100,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:
HAS_CUTLASS_OR_CK = is_hip() or (
is_cuda() and get_nvidia_gpu_model() != "NVIDIA B200"
)
except (ImportError, AttributeError, FileNotFoundError):
except (ImportError, AttributeError, FileNotFoundError, OSError):
HAS_CUTLASS_OR_CK = False

try:
@@ -111,7 +111,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:

# TODO: remove these b200 hacks.
HAS_CUBLAS = is_cuda() and get_nvidia_gpu_model() != "NVIDIA B200"
except (ImportError, IOError, AttributeError, FileNotFoundError):
except (ImportError, IOError, AttributeError, FileNotFoundError, OSError):
HAS_CUBLAS = False


2 changes: 1 addition & 1 deletion tritonbench/operators/fp8_gemm_rowwise_grouped/operator.py
@@ -180,7 +180,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:
cutlass_or_ck_fp8_grouped_mm = torch.ops.fbgemm.f8f8bf16_rowwise_grouped_stacked
# Set HAS_CUTLASS_OR_CK to True if import succeeds
HAS_CUTLASS_OR_CK = True
except (ImportError, AttributeError):
except (ImportError, AttributeError, OSError):
# Set HAS_CUTLASS_OR_CK to False if import fails
HAS_CUTLASS_OR_CK = False

2 changes: 1 addition & 1 deletion tritonbench/utils/python_utils.py
@@ -9,5 +9,5 @@ def try_import(cond_name: str):
try:
yield
_caller_globals[cond_name] = True
except (ImportError, ModuleNotFoundError) as e:
except (ImportError, ModuleNotFoundError, OSError) as e:
_caller_globals[cond_name] = False
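
For context, try_import is a generator-based context manager that records import success into the caller's globals; adding OSError here matches the operator changes above, since loading a compiled extension can fail at dlopen time with OSError rather than ImportError. A minimal usage sketch (the flag name and import are illustrative):

from tritonbench.utils.python_utils import try_import

# Sets HAS_FBGEMM_GENAI = True in this module's globals if the import
# succeeds, and False if it raises ImportError, ModuleNotFoundError, or OSError.
with try_import("HAS_FBGEMM_GENAI"):
    import fbgemm_gpu.experimental.gen_ai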