#!/bin/bash


# Per-GPU cuBLAS matmul benchmark sweep.
#
# Usage: <this script> [GPU_ID ...]
#   With no arguments GPUs 0..7 are tested; otherwise only the listed ids.
#
# For every GPU the script prints "test for GPU <id>", then for each
# precision case a label line followed by the 10th whitespace-separated field
# of every cublasMatmulBench output line that contains "CUDA" (matched
# case-insensitively).
# NOTE(review): field 10 is presumably the throughput figure -- confirm
# against the tool's output format.
#
# stdout carries only the labels and extracted values; diagnostics go to
# stderr. The exit status is non-zero if any individual run produced no result.

# Load the CUDA 12.1 environment module. If `module` is not defined in this
# shell, bash reports the error and the script carries on with the current
# environment.
module load compilers/cuda/12.1

# Directory prepended to PATH so that cublasMatmulBench can be found.
export PATH=/home/bingxing2/home/scx6002/caijq/dontdelete/jichu/benchmark-tool:$PATH

#######################################
# Run one cublasMatmulBench case on one GPU and print its result.
# Arguments:
#   $1 - GPU id, exported to the tool as CUDA_VISIBLE_DEVICES
#   $2 - label printed on its own line before the result
#   $3 - value for the tool's -P option (precision/type combination)
#   $4 - value for the tool's -T option
#        (presumably the repeat count -- TODO confirm)
# Outputs:
#   stdout: the label, then field 10 of each output line containing "CUDA"
#   stderr: a warning when the tool fails or no such line is produced
# Returns:
#   0 on success, 1 when the tool failed or no matching line was found
#######################################
run_gemm_bench() {
    local gpu=$1 label=$2 precision=$3 t_value=$4
    local -a stage_status

    printf '%s\n' "$label"

    # The problem size (m, n, k) and the remaining flags are the same for
    # every case; only -P and -T vary.
    CUDA_VISIBLE_DEVICES=$gpu cublasMatmulBench \
        -P="$precision" -m=15360 -n=18176 -k=8192 -T="$t_value" \
        -ta=1 -B=0 -p=0 \
        | grep -i CUDA \
        | awk '{print$10}'
    # Must be captured immediately: any later command overwrites PIPESTATUS.
    stage_status=("${PIPESTATUS[@]}")

    if ((stage_status[0] != 0)); then
        printf 'warning: %s on GPU %s: cublasMatmulBench exited with status %s\n' \
            "$label" "$gpu" "${stage_status[0]}" >&2
        return 1
    fi
    if ((stage_status[1] != 0)); then
        # grep exits non-zero when no line matched, i.e. there was no result.
        printf 'warning: %s on GPU %s: no "CUDA" line in benchmark output\n' \
            "$label" "$gpu" >&2
        return 1
    fi
    return 0
}

#######################################
# Run every precision case on every requested GPU.
# Arguments:
#   GPU ids to test; defaults to 0..7 when none are given
# Returns:
#   0 if every run produced a result, 1 otherwise
#######################################
main() {
    local -a gpus=("$@")
    if ((${#gpus[@]} == 0)); then
        gpus=({0..7})
    fi

    # One entry per case, formatted as "label:-P value:-T value".
    local -a cases=(
        'FP64:ddd:8'
        'FP32:sss:500'
        'TF32:sss_fast_tf32:500'
        'FP16(hhh):hhh:1000'
        'hsh:hsh:1000'
        'hss:hss:1000'
        'BF16:tst:1000'
        'FP8:qqssq:1000'
    )

    local gpu spec label precision t_value
    local failures=0

    for gpu in "${gpus[@]}"; do
        printf 'test for GPU %s\n' "$gpu"
        for spec in "${cases[@]}"; do
            IFS=: read -r label precision t_value <<<"$spec"
            # Best effort: a failing case must not stop the rest of the sweep.
            run_gemm_bench "$gpu" "$label" "$precision" "$t_value" \
                || failures=$((failures + 1))
        done
    done

    if ((failures > 0)); then
        printf 'warning: %d benchmark run(s) produced no result\n' "$failures" >&2
        return 1
    fi
    return 0
}

main "$@"
