diff --git a/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json index 327b004cb..957b028bb 100644 --- a/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen1024_vlen1024_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 1975, - "eval.ntt_128": 3573, - "eval.ntt_256": 5247, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2305, + "eval.ntt_128": 4337, + "eval.ntt_256": 6100, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json index 438652fec..5837d21a1 100644 --- a/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen1024_vlen2048_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 1975, - "eval.ntt_128": 3573, - "eval.ntt_256": 5229, - "eval.ntt_512": 8099, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2305, + "eval.ntt_128": 4337, + "eval.ntt_256": 6083, + "eval.ntt_512": 8057, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json index 83c2e6f7c..bce8a91fc 100644 --- a/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen1024_vlen4096_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 1975, - "eval.ntt_128": 3573, - "eval.ntt_256": 5229, - "eval.ntt_512": 8096, - "eval.ntt_1024": 20939, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2305, + "eval.ntt_128": 4337, + "eval.ntt_256": 6083, + "eval.ntt_512": 8057, + "eval.ntt_1024": 21131, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json index 2c265ea46..c74703241 100644 --- a/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen1024_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2757, - "eval.ntt_128": 4738, - "eval.ntt_256": 8777, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 3087, + "eval.ntt_128": 5428, + "eval.ntt_256": 9060, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json index c63a22ccf..423803b5a 100644 --- a/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen128_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 58036, - "eval.ntt_128": 58871, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 57783, + "eval.ntt_128": 57779, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json index 39d13ae70..4851f4241 100644 --- a/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen2048_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2757, - "eval.ntt_128": 4734, - "eval.ntt_256": 8721, - "eval.ntt_512": 18629, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 3087, + "eval.ntt_128": 5428, + "eval.ntt_256": 9560, + "eval.ntt_512": 18674, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json index ce07a3e7f..700353d5d 100644 --- a/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen256_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2757, - "eval.ntt_128": 58871, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 3087, + "eval.ntt_128": 57779, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json index 3c42c0a5c..d04a9522a 100644 --- a/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen4096_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2757, - "eval.ntt_128": 4734, - "eval.ntt_256": 8736, - "eval.ntt_512": 18627, - "eval.ntt_1024": 43035, - "eval.ntt_4096": 56564 + "eval.ntt_64": 3087, + "eval.ntt_128": 5428, + "eval.ntt_256": 9560, + "eval.ntt_512": 18673, + "eval.ntt_1024": 43078, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json index 43fe157ce..4a30f6553 100644 --- a/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen128_vlen512_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2757, - "eval.ntt_128": 4722, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 3087, + "eval.ntt_128": 5429, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json index 8d19b941b..b30e4d553 100644 --- a/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen256_vlen1024_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2192, - "eval.ntt_128": 3469, - "eval.ntt_256": 6153, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2522, + "eval.ntt_128": 4082, + "eval.ntt_256": 6218, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json index 2a0fa5e9c..7674364c8 100644 --- a/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen256_vlen2048_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2192, - "eval.ntt_128": 3469, - "eval.ntt_256": 6153, - "eval.ntt_512": 10429, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2522, + "eval.ntt_128": 4082, + "eval.ntt_256": 6218, + "eval.ntt_512": 10473, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json index 6233f5bd6..0f7057718 100644 --- a/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen256_vlen256_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2203, - "eval.ntt_128": 58871, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2533, + "eval.ntt_128": 57779, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json index abd09b0d3..ee6ec7105 100644 --- a/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen256_vlen4096_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2192, - "eval.ntt_128": 3469, - "eval.ntt_256": 6153, - "eval.ntt_512": 10852, - "eval.ntt_1024": 26612, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2522, + "eval.ntt_128": 4082, + "eval.ntt_256": 6218, + "eval.ntt_512": 10484, + "eval.ntt_1024": 26525, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json index bfa444787..60ecc84b1 100644 --- a/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen256_vlen512_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2196, - "eval.ntt_128": 3335, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2527, + "eval.ntt_128": 4093, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json index aa53562bc..734b209e3 100644 --- a/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen512_vlen1024_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2012, - "eval.ntt_128": 3278, - "eval.ntt_256": 5090, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2340, + "eval.ntt_128": 3717, + "eval.ntt_256": 6119, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json index 7946f0d8b..e9faa7eb2 100644 --- a/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen512_vlen2048_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2012, - "eval.ntt_128": 3276, - "eval.ntt_256": 5090, - "eval.ntt_512": 9059, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2340, + "eval.ntt_128": 3717, + "eval.ntt_256": 6119, + "eval.ntt_512": 9228, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json index f4bbdd2f9..0a841927a 100644 --- a/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen512_vlen4096_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2012, - "eval.ntt_128": 3276, - "eval.ntt_256": 5090, - "eval.ntt_512": 9059, - "eval.ntt_1024": 22351, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2340, + "eval.ntt_128": 3717, + "eval.ntt_256": 6119, + "eval.ntt_512": 9223, + "eval.ntt_1024": 22370, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json b/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json index 914a533bb..255a1686d 100644 --- a/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json +++ b/.github/designs/benchmark_dlen512_vlen512_fp/t1rocketemu.json @@ -1,8 +1,8 @@ { - "eval.ntt_64": 2012, - "eval.ntt_128": 3287, - "eval.ntt_256": 58884, - "eval.ntt_512": 58013, - "eval.ntt_1024": 57728, - "eval.ntt_4096": 56564 + "eval.ntt_64": 2340, + "eval.ntt_128": 3724, + "eval.ntt_256": 57922, + "eval.ntt_512": 57515, + "eval.ntt_1024": 58997, + "eval.ntt_4096": 56320 } \ No newline at end of file diff --git a/.github/designs/blastoise/t1rocketemu.json b/.github/designs/blastoise/t1rocketemu.json index 18b57abe5..e6917fc34 100644 --- a/.github/designs/blastoise/t1rocketemu.json +++ b/.github/designs/blastoise/t1rocketemu.json @@ -1,9 +1,9 @@ { - "asm.memcpy": 1813, - "asm.mmm": 52503, - "asm.smoke": 8556, - "asm.strlen": 14192, - "asm.utf8_count": 505, + "asm.memcpy": 2260, + "asm.mmm": 52837, + "asm.smoke": 9061, + "asm.strlen": 14589, + "asm.utf8_count": 859, "codegen.vaadd_vv": 509657, "codegen.vaadd_vx": 1082497, "codegen.vaaddu_vv": 509657, @@ -499,30 +499,30 @@ "codegen.vxor_vx": 272297, "codegen.vzext_vf2": 35727, "codegen.vzext_vf4": 9179, - "intrinsic.conv2d_less_m2": 4359, - "intrinsic.linear_normalization": 4818, + "intrinsic.conv2d_less_m2": 4684, + "intrinsic.linear_normalization": 5505, "intrinsic.matmul": 156950, - "intrinsic.softmax": 9003, - "mlir.axpy_masked": 11636, - "mlir.conv": 298996, - "mlir.hello": 371, - "mlir.matmul": 75518, - "mlir.maxvl_tail_setvl_front": 2375, - "mlir.rvv_vp_intrinsic_add": 774, - "mlir.rvv_vp_intrinsic_add_scalable": 1173, - "mlir.stripmining": 35059, - "mlir.vectoradd": 72377, - "pytorch.demo": 122735, - "pytorch.matmul": 109796, - "rvv_bench.ascii_to_utf16": 1383013, - "rvv_bench.ascii_to_utf32": 440566, - "rvv_bench.byteswap": 449395, - "rvv_bench.chacha20": 45624, - "rvv_bench.mandelbrot": 563959, - "rvv_bench.memcpy": 1267552, - "rvv_bench.memset": 510029, - "rvv_bench.mergelines": 777753, - "rvv_bench.poly1305": 45624, - "rvv_bench.strlen": 494533, - "rvv_bench.utf8_count": 2659715 + "intrinsic.softmax": 9329, + "mlir.axpy_masked": 11962, + "mlir.conv": 299284, + "mlir.hello": 694, + "mlir.matmul": 75963, + "mlir.maxvl_tail_setvl_front": 2762, + "mlir.rvv_vp_intrinsic_add": 1127, + "mlir.rvv_vp_intrinsic_add_scalable": 1539, + "mlir.stripmining": 35418, + "mlir.vectoradd": 72384, + "pytorch.demo": 123874, + "pytorch.matmul": 109914, + "rvv_bench.ascii_to_utf16": 1383170, + "rvv_bench.ascii_to_utf32": 440610, + "rvv_bench.byteswap": 449384, + "rvv_bench.chacha20": 45889, + "rvv_bench.mandelbrot": 564338, + "rvv_bench.memcpy": 1268232, + "rvv_bench.memset": 509712, + "rvv_bench.mergelines": 770459, + "rvv_bench.poly1305": 45889, + "rvv_bench.strlen": 495094, + "rvv_bench.utf8_count": 2669518 } \ No newline at end of file diff --git a/.github/designs/rookidee/t1rocketemu.json b/.github/designs/rookidee/t1rocketemu.json index 7e8005b9b..9d6b93797 100644 --- a/.github/designs/rookidee/t1rocketemu.json +++ b/.github/designs/rookidee/t1rocketemu.json @@ -1,6 +1,6 @@ { - "asm.mmm": 57732, - "asm.smoke": 8647, + "asm.mmm": 58062, + "asm.smoke": 9136, "codegen.vaadd_vv": 306399, "codegen.vaadd_vx": 819226, "codegen.vaaddu_vv": 306399, @@ -430,19 +430,19 @@ "codegen.vxor_vx": 214202, "codegen.vzext_vf2": 29723, "codegen.vzext_vf4": 11289, - "intrinsic.conv2d_less_m2": 4410, - "mlir.hello": 369, - "mlir.rvv_vp_intrinsic_add": 762, - "mlir.rvv_vp_intrinsic_add_scalable": 1062, - "mlir.stripmining": 58612, - "rvv_bench.ascii_to_utf16": 1385284, - "rvv_bench.ascii_to_utf32": 431848, - "rvv_bench.byteswap": 489626, - "rvv_bench.chacha20": 45624, - "rvv_bench.memcpy": 1285481, - "rvv_bench.memset": 509966, - "rvv_bench.mergelines": 795847, - "rvv_bench.poly1305": 45624, - "rvv_bench.strlen": 529066, - "rvv_bench.utf8_count": 2753737 + "intrinsic.conv2d_less_m2": 4728, + "mlir.hello": 692, + "mlir.rvv_vp_intrinsic_add": 1122, + "mlir.rvv_vp_intrinsic_add_scalable": 1428, + "mlir.stripmining": 58928, + "rvv_bench.ascii_to_utf16": 1386186, + "rvv_bench.ascii_to_utf32": 431879, + "rvv_bench.byteswap": 489489, + "rvv_bench.chacha20": 45889, + "rvv_bench.memcpy": 1285677, + "rvv_bench.memset": 510014, + "rvv_bench.mergelines": 790052, + "rvv_bench.poly1305": 45889, + "rvv_bench.strlen": 530552, + "rvv_bench.utf8_count": 2762239 } \ No newline at end of file diff --git a/nix/overlay.nix b/nix/overlay.nix index 7bfe75237..4a3b3fe54 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -151,4 +151,30 @@ rec { }; t1 = final.callPackage ./t1 { }; + + buddy-codegen = final.callPackage ./pkgs/buddy-codegen { }; + + spike-t1 = final.writeShellApplication { + name = "spike-t1"; + + runtimeInputs = with final; [ + pkgsCross.riscv32-embedded.buildPackages.gcc + spike + dtc + ]; + + text = '' + elf=''${1:-} + if [[ -z "$elf" ]]; then + echo "Require argument to find elf" >&2 + exit 1 + fi + + spike -d --isa=rv32gcv_zvl2048b_zve32f \ + --priv=m \ + -m0x20000000:0x20000000,0x00000000:0x20000000,0x40000000:0x80000000,0xc0000000:0x40000000 \ + --pc=0x"$(grep '<_start>' <(riscv32-none-elf-objdump -d "$elf") | cut -d' ' -f1)" \ + "$elf" + ''; + }; } diff --git a/nix/patches/buddy-mlir/00-fix-splat-op.patch b/nix/patches/buddy-mlir/00-fix-splat-op.patch new file mode 100644 index 000000000..3fd32476c --- /dev/null +++ b/nix/patches/buddy-mlir/00-fix-splat-op.patch @@ -0,0 +1,59 @@ +--- a/frontend/Python/ops/linalg.py 1970-01-01 08:00:01.000000000 +0800 ++++ b/frontend/Python/ops/linalg.py 1970-01-01 08:00:01.000000000 +0800 +@@ -1166,7 +1166,7 @@ + element = mlir_element_attr_get(dtype, 0.0) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result +- op = linalg.matmul(input1, input2, outs=[matmul_result_buffer]) ++ op = linalg.matmul(input1, input2, outputs=[matmul_result_buffer]) + return op + + +@@ -1186,7 +1186,7 @@ + element = mlir_element_attr_get(dtype, 0.0) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + result_buffer = arith.ConstantOp(tensor_type, attr).result +- op = linalg.matmul_transpose_b(input1, input2, outs=[result_buffer]) ++ op = linalg.matmul_transpose_b(input1, input2, outputs=[result_buffer]) + return op + + +@@ -1854,9 +1854,9 @@ + output = tensor.EmptyOp(output_shape, mlir_dtype) + + if not isinstance(input2.type, ir.RankedTensorType): +- input2 = tensor.SplatOp(tensor_type, input2).result ++ input2 = tensor.SplatOp(tensor_type, input2, []).result + if not isinstance(input3.type, ir.RankedTensorType): +- input3 = tensor.SplatOp(tensor_type, input3).result ++ input3 = tensor.SplatOp(tensor_type, input3, []).result + + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] +@@ -2038,7 +2038,7 @@ + input_shape = ir.RankedTensorType(input_tensor.type).shape + tensor_type = ir.RankedTensorType.get(input_shape, input_dtype) + scalar = arith.ConstantOp(input_dtype, node.args[1]) +- rhs = tensor.SplatOp(tensor_type, scalar) ++ rhs = tensor.SplatOp(tensor_type, scalar, []) + if str(input_dtype).find("i") != -1: + cmp_op = arith.CmpIOp(4, input_tensor, rhs) + else: +@@ -2069,7 +2069,7 @@ + tensor_type = ir.RankedTensorType.get(input_shape, input_dtype) + + scalar = arith.ConstantOp(input_dtype, node.args[1]) +- rhs = tensor.SplatOp(tensor_type, scalar) ++ rhs = tensor.SplatOp(tensor_type, scalar, []) + + if str(input_dtype).find("i") != -1: + cmp_op = arith.CmpIOp(5, input_tensor, rhs) +@@ -2390,7 +2390,7 @@ + scalar = arith.ConstantOp(input_dtype, float(node.args[1])) + else: + scalar = arith.ConstantOp(input_dtype, node.args[1]) +- rhs = tensor.SplatOp(tensor_type, scalar) ++ rhs = tensor.SplatOp(tensor_type, scalar, []) + if str(input_dtype).find("i") != -1: + cmp_op = arith.CmpIOp(0, input_tensor, rhs) + else: diff --git a/nix/patches/llvm/fix-vector-convert.patch b/nix/patches/llvm/fix-vector-convert.patch new file mode 100644 index 000000000..9be7dc4a6 --- /dev/null +++ b/nix/patches/llvm/fix-vector-convert.patch @@ -0,0 +1,100 @@ +--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp 1970-01-01 00:00:01.000000000 +0000 ++++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp 2025-04-08 19:56:05.027381997 +0000 +@@ -4512,7 +4512,8 @@ + + // Src needs to have twice the number of elements. + unsigned NumElts = VT.getVectorNumElements(); +- if (Src.getValueType().getVectorNumElements() != (NumElts * 2)) ++ if (!Src.getValueType().isFixedLengthVector() || ++ Src.getValueType().getVectorNumElements() != (NumElts * 2)) + return SDValue(); + + // The extracts must extract the two halves of the source. +--- a/mlir/include/mlir/Conversion/Passes.td 1970-01-01 00:00:01.000000000 +0000 ++++ b/mlir/include/mlir/Conversion/Passes.td 2025-04-08 18:58:09.826017766 +0000 +@@ -1444,6 +1444,9 @@ + "vector::VectorTransformsOptions", + /*default=*/"vector::VectorTransformsOptions()", + "Options to lower some operations like contractions and transposes.">, ++ Option<"indexBitwidth", "index-bitwidth", "unsigned", ++ /*default=kDeriveIndexBitwidthFromDataLayout*/"0", ++ "Bitwidth of the index type, 0 to use size of machine word">, + ]; + } + +--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp 1970-01-01 00:00:01.000000000 +0000 ++++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp 2025-04-08 18:59:47.622433399 +0000 +@@ -1438,8 +1438,6 @@ + if (llvm::any_of(*targetStrides, ShapedType::isDynamic)) + return failure(); + +- auto int64Ty = IntegerType::get(rewriter.getContext(), 64); +- + // Create descriptor. + auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); + // Set allocated ptr. +@@ -1450,21 +1448,23 @@ + Value ptr = sourceMemRef.alignedPtr(rewriter, loc); + desc.setAlignedPtr(rewriter, loc, ptr); + // Fill offset 0. +- auto attr = rewriter.getIntegerAttr(rewriter.getIndexType(), 0); +- auto zero = rewriter.create(loc, int64Ty, attr); ++ auto idxType = rewriter.getIndexType(); ++ auto zero = rewriter.create( ++ loc, typeConverter->convertType(idxType), ++ rewriter.getIntegerAttr(idxType, 0)); + desc.setOffset(rewriter, loc, zero); + + // Fill size and stride descriptors in memref. + for (const auto &indexedSize : + llvm::enumerate(targetMemRefType.getShape())) { + int64_t index = indexedSize.index(); +- auto sizeAttr = +- rewriter.getIntegerAttr(rewriter.getIndexType(), indexedSize.value()); +- auto size = rewriter.create(loc, int64Ty, sizeAttr); ++ auto size = rewriter.create( ++ loc, typeConverter->convertType(idxType), ++ rewriter.getIntegerAttr(idxType, indexedSize.value())); + desc.setSize(rewriter, loc, index, size); +- auto strideAttr = rewriter.getIntegerAttr(rewriter.getIndexType(), +- (*targetStrides)[index]); +- auto stride = rewriter.create(loc, int64Ty, strideAttr); ++ auto stride = rewriter.create( ++ loc, typeConverter->convertType(idxType), ++ rewriter.getIntegerAttr(idxType, (*targetStrides)[index])); + desc.setStride(rewriter, loc, index, stride); + } + +--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp 1970-01-01 00:00:01.000000000 +0000 ++++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp 2025-04-08 19:00:55.000728446 +0000 +@@ -8,6 +8,7 @@ + + #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" + ++#include "mlir/Analysis/DataLayoutAnalysis.h" + #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" + #include "mlir/Conversion/LLVMCommon/TypeConverter.h" + #include "mlir/Dialect/AMX/AMXDialect.h" +@@ -64,6 +65,8 @@ + // Perform progressive lowering of operations on slices and all contraction + // operations. Also materializes masks, lowers vector.step, rank-reduces FMA, + // applies folding and DCE. ++ Operation *op = getOperation(); ++ const auto &dataLayoutAnalysis = getAnalysis(); + { + RewritePatternSet patterns(&getContext()); + populateVectorToVectorCanonicalizationPatterns(patterns); +@@ -85,8 +88,11 @@ + } + + // Convert to the LLVM IR dialect. +- LowerToLLVMOptions options(&getContext()); +- LLVMTypeConverter converter(&getContext(), options); ++ LowerToLLVMOptions options(&getContext(), ++ dataLayoutAnalysis.getAtOrAbove(op)); ++ if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) ++ options.overrideIndexBitwidth(indexBitwidth); ++ LLVMTypeConverter converter(&getContext(), options, &dataLayoutAnalysis); + RewritePatternSet patterns(&getContext()); + populateVectorTransferLoweringPatterns(patterns); + populateVectorToLLVMMatrixConversionPatterns(converter, patterns); diff --git a/nix/pkgs/buddy-codegen/buddy-codegen.cc b/nix/pkgs/buddy-codegen/buddy-codegen.cc new file mode 100644 index 000000000..c140f81b1 --- /dev/null +++ b/nix/pkgs/buddy-codegen/buddy-codegen.cc @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include +#include // For precision control +#include +#include + +template +std::string generate_c_array_code(T *data, size_t size, + const std::string &declare, + const std::string &array_name) { + std::ostringstream oss; + oss << declare << " " << array_name << "[" << size << "] = {\n "; + for (size_t i = 0; i < size; ++i) { + oss << std::fixed << std::setprecision(6) << data[i]; + if (i != size - 1) + oss << ", "; + if ((i + 1) % 10 == 0) + oss << "\n "; + } + + oss << "\n};\n"; + return oss.str(); +} + +int main(int argc, char *argv[]) { + argparse::ArgumentParser program("buddy-codegen"); + + argparse::ArgumentParser img_cmd("img"); + img_cmd.add_description("Convert image to MLIR memref C code"); + img_cmd.add_argument("-i", "--input") + .required() + .help("specify the input file"); + img_cmd.add_argument("-o", "--output") + .required() + .help("specify the output file."); + // TODO: support other format + img_cmd.add_argument("-m", "--image-mode").default_value("rgb"); + + argparse::ArgumentParser arg_cmd("arg"); + arg_cmd.add_description("Convert PyTorch parameter to MLIR memref C code"); + arg_cmd.add_argument("-i", "--input") + .required() + .help("specify the input file"); + arg_cmd.add_argument("-o", "--output") + .required() + .help("specify the output file."); + arg_cmd.add_argument("-s", "--size") + .scan<'d', size_t>() + .required() + .help("specify the parameter size."); + + program.add_subparser(img_cmd); + program.add_subparser(arg_cmd); + + try { + program.parse_args(argc, argv); + } catch (const std::exception &err) { + std::cerr << err.what() << std::endl; + std::cerr << program; + std::exit(1); + } + + if (program.is_subcommand_used(img_cmd)) { + std::string imgPath = img_cmd.get("input"); + std::ofstream outPath; + outPath.open(img_cmd.get("output")); + + std::cout << "[buddy-codegen] Loading image..." << std::endl; + dip::Image input(imgPath, dip::DIP_RGB, true); + // TODO read the col, row using PNG lib + MemRef inputResize = dip::Resize4D_NCHW( + &input, dip::INTERPOLATION_TYPE::BILINEAR_INTERPOLATION, + {1, 3, 224, 224} /*{image_cols, image_rows}*/); + auto sizes = inputResize.getSizes(); + outPath << "#include " << std::endl; + outPath << generate_c_array_code(sizes, 4, "static const int32_t", + "IMAGE_SIZES") + << std::endl; + outPath << generate_c_array_code( + inputResize.getData(), inputResize.getSize(), + "__attribute((section(\".vdata\"))) float", "IMAGE") + << std::endl; + outPath.close(); + std::cout << "[buddy-codegen] code generated" << std::endl; + } else if (program.is_subcommand_used(arg_cmd)) { + std::string argPath = arg_cmd.get("input"); + std::ofstream outPath; + outPath.open(arg_cmd.get("output")); + auto params_size = arg_cmd.get("size"); + + MemRef params({params_size}); + + const auto loadStart = std::chrono::high_resolution_clock::now(); + // Open the parameter file in binary mode. + std::ifstream paramFile(argPath, std::ios::in | std::ios::binary); + if (!paramFile.is_open()) { + throw std::runtime_error("[Error] Failed to open params file!"); + } + std::cout << "[buddy-codegen] Loading params..." << std::endl; + // Read the parameter data into the provided memory reference. + paramFile.read(reinterpret_cast(params.getData()), + sizeof(float) * (params.getSize())); + if (paramFile.fail()) { + throw std::runtime_error("Error occurred while reading params file!"); + } + paramFile.close(); + + outPath << "#include " << std::endl; + outPath << generate_c_array_code(params.getSizes(), 1, + "static const int32_t", "PARAMS_SIZES") + << std::endl; + outPath << generate_c_array_code(params.getData(), params.getSize(), + "__attribute((section(\".vdata\"))) float", + "PARAMS") + << std::endl; + outPath.close(); + + const auto loadEnd = std::chrono::high_resolution_clock::now(); + const std::chrono::duration loadTime = + loadEnd - loadStart; + std::cout << "[buddy-codegen] Params load time: " + << (double)(loadTime.count()) / 1000 << "s" << std::endl; + } +} diff --git a/nix/pkgs/buddy-codegen/default.nix b/nix/pkgs/buddy-codegen/default.nix new file mode 100644 index 000000000..caca78327 --- /dev/null +++ b/nix/pkgs/buddy-codegen/default.nix @@ -0,0 +1,63 @@ +{ + lib, + stdenv, + argparse, + buddy-mlir, + libpng, +}: +stdenv.mkDerivation { + name = "buddy-codegen"; + + src = + with lib.fileset; + toSource { + fileset = unions [ + ./dip.mlir + ./buddy-codegen.cc + ]; + root = ./.; + }; + + buildInputs = [ + libpng + argparse + buddy-mlir + ]; + + env.NIX_CFLAGS_COMPILE = toString [ + # TODO: BMP is now broken + "-lpng" + "-DBUDDY_ENABLE_PNG" + "-O3" + ]; + + buildPhase = '' + runHook preBuild + # We don't need to care about stripmining size here + buddy-opt dip.mlir \ + -lower-dip="DIP-strip-mining=256" \ + -arith-expand \ + -lower-affine \ + -llvm-request-c-wrappers \ + -convert-scf-to-cf \ + -convert-math-to-llvm \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + buddy-translate --mlir-to-llvmir | \ + buddy-llc \ + --filetype=obj \ + -o dip.o + + $CXX ./dip.o ./buddy-codegen.cc -o buddy-codegen + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + mkdir -p $out/bin + cp -v buddy-codegen $out/bin/ + runHook postInstall + ''; +} diff --git a/nix/pkgs/buddy-codegen/dip.mlir b/nix/pkgs/buddy-codegen/dip.mlir new file mode 100644 index 000000000..44a0c194d --- /dev/null +++ b/nix/pkgs/buddy-codegen/dip.mlir @@ -0,0 +1,30 @@ +//===- DIP.mlir -----------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file provides DIP dialect functions. +// +//===----------------------------------------------------------------------===// +func.func @resize_4d_nchw_nearest_neighbour_interpolation(%inputImage : memref, %horizontal_scaling_factor : f32, %vertical_scaling_factor : f32, %outputImage : memref) attributes{llvm.emit_c_interface} +{ + dip.resize_4d_nchw NEAREST_NEIGHBOUR_INTERPOLATION %inputImage, %horizontal_scaling_factor, %vertical_scaling_factor, %outputImage : memref, f32, f32, memref + return +} + +func.func @resize_4d_nchw_bilinear_interpolation(%inputImage : memref, %horizontal_scaling_factor : f32, %vertical_scaling_factor : f32, %outputImage : memref) attributes{llvm.emit_c_interface} +{ + dip.resize_4d_nchw BILINEAR_INTERPOLATION %inputImage, %horizontal_scaling_factor, %vertical_scaling_factor, %outputImage : memref, f32, f32, memref + return +} diff --git a/nix/pkgs/buddy-llvm.nix b/nix/pkgs/buddy-llvm.nix index 690a7fe62..b04ce38bb 100644 --- a/nix/pkgs/buddy-llvm.nix +++ b/nix/pkgs/buddy-llvm.nix @@ -12,20 +12,25 @@ let ps.pybind11 ps.pyyaml ps.ml-dtypes + ps.nanobind ]); in stdenv.mkDerivation rec { name = "llvm-for-buddy-mlir"; - version = "6c59f0e1b0fb56c909ad7c9aad4bde37dc006ae0"; + version = "3bd3e06f3fe418e24af65457877f40cee0544f9d"; src = fetchFromGitHub { owner = "llvm"; repo = "llvm-project"; rev = version; - hash = "sha256-bMJJ2q1hSh7m0ewclHOmIe7lOHv110rz/P7D3pw8Uiw="; + hash = "sha256-JSquIeA14dXKXO6E8v0HV36fA/+bZypJKkcGMvPxxHI="; }; requiredSystemFeatures = [ "big-parallel" ]; + patches = [ + ../patches/llvm/fix-vector-convert.patch + ]; + propagatedBuildInputs = [ pythonEnv ]; @@ -66,6 +71,7 @@ stdenv.mkDerivation rec { # move all lib files to $lib except lib/cmake moveToOutput "lib" "$lib" + moveToOutput "python_packages" "$lib" moveToOutput "lib/cmake" "$dev" moveToOutput "src" "$dev" diff --git a/nix/pkgs/buddy-mlir.nix b/nix/pkgs/buddy-mlir.nix index 6cb60c8d7..2491b0beb 100644 --- a/nix/pkgs/buddy-mlir.nix +++ b/nix/pkgs/buddy-mlir.nix @@ -1,4 +1,5 @@ { + lib, cmake, ninja, llvmPackages_17, @@ -17,12 +18,22 @@ let version = "unstable-2024-07-18"; src = fetchFromGitHub { - owner = "buddy-compiler"; + owner = "WuXintong123"; repo = "buddy-mlir"; - rev = "c57584a0e3c38e938a3902320f62b202ced84996"; - hash = "sha256-IBsShnkaA0qPkEMbkkSjUMWXnDGW/CrTeiSSLLttlXk="; + rev = "6586555adf921371906fe908293714bff4d92b24"; + hash = "sha256-NDdj72oNhIKcU7cOw+RDzPrjKLIUVY63TDUrJ2DzYL0="; }; + patches = [ + ../patches/buddy-mlir/00-fix-splat-op.patch + ]; + + postPatch = '' + sed -i \ + 's|link_directories(''${LLVM_BINARY_DIR}/tools/mlir/|link_directories(''${LLVM_BINARY_DIR}/|' \ + midend/python/CMakeLists.txt + ''; + nativeBuildInputs = [ cmake ninja @@ -33,8 +44,8 @@ let ]; cmakeFlags = [ - "-DMLIR_DIR=${buddy-llvm.dev}/lib/cmake/mlir" - "-DLLVM_DIR=${buddy-llvm.dev}/lib/cmake/llvm" + "-DMLIR_DIR=${buddy-llvm}/lib/cmake/mlir" + "-DLLVM_DIR=${buddy-llvm}/lib/cmake/llvm" "-DLLVM_MAIN_SRC_DIR=${buddy-llvm.src}/llvm" "-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON" "-DCMAKE_BUILD_TYPE=Release" @@ -43,11 +54,17 @@ let # No need to do check, and it also takes too much time to finish. doCheck = false; + # TODO: Upstream this to Buddy-MLIR cmake install + postInstall = '' + mkdir -p "$out/include" + cp -vr "$NIX_BUILD_TOP/$sourceRoot/frontend/Interfaces/buddy" "$out/include" + ''; + # Here we concatenate the LLVM and Buddy python module into one directory for easier import postFixup = '' mkdir -p $out/lib/python${python3.pythonVersion}/site-packages cp -vr $out/python_packages/buddy $out/lib/python${python3.pythonVersion}/site-packages/ - cp -vr ${buddy-llvm}/python_packages/mlir_core/mlir $out/lib/python${python3.pythonVersion}/site-packages/ + cp -vr ${buddy-llvm.lib}/python_packages/mlir_core/mlir $out/lib/python${python3.pythonVersion}/site-packages/ ''; passthru = { @@ -69,6 +86,7 @@ let # tinyllama ps.transformers ps.accelerate + ps.sentencepiece ]); }; }; diff --git a/rocketv/src/RocketCore.scala b/rocketv/src/RocketCore.scala index 25eaf8d26..d144ed139 100644 --- a/rocketv/src/RocketCore.scala +++ b/rocketv/src/RocketCore.scala @@ -1609,8 +1609,8 @@ class Rocket(val parameter: RocketParameter) t1XRDRetireQueue.deq.ready := (!(wbWxd || (dmemResponseReplay && dmemResponseXpu)) || !vectorTryToWriteRd) && (!(dmemResponseReplay && dmemResponseFpu) || !vectorTryToWriteFP) when(t1XRDRetireQueue.deq.fire && vectorTryToWriteRd) { - longlatencyWdata := t1.retire.rd.bits.rdData - longlatencyWaddress := t1.retire.rd.bits.rdAddress + longlatencyWdata := t1XRDRetireQueue.deq.bits.rdData + longlatencyWaddress := t1XRDRetireQueue.deq.bits.rdAddress longLatencyWenable := true.B } io.fpu.foreach { fpu => diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 88fc63c5f..82ebe2c80 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -684,10 +684,8 @@ class T1(val parameter: T1Parameter) val completeIndexInstruction: Bool = ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle - val freeOR: Bool = VecInit(slots.map(_.state.idle)).asUInt.orR - /** slot is ready to accept new instructions. */ - val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR) + val slotReady: Bool = VecInit(slots.map(_.state.idle)).asUInt.andR val olderCheck: Bool = slots.map { re => // The same lsb will make it difficult to distinguish between the new and the old diff --git a/tests/builder.nix b/tests/builder.nix index 3b1a9b9ae..aeffd6929 100644 --- a/tests/builder.nix +++ b/tests/builder.nix @@ -44,6 +44,9 @@ let "-static" "-mcmodel=medany" "-fvisibility=hidden" + "-fno-exceptions" + "-fno-rtti" + "-fno-threadsafe-statics" "-fno-PIC" "-g" "-O3" diff --git a/tests/intrinsic/conv2d_resnet/conv2d_resnet.c b/tests/intrinsic/conv2d_resnet/conv2d_resnet.c new file mode 100644 index 000000000..2f7ed6200 --- /dev/null +++ b/tests/intrinsic/conv2d_resnet/conv2d_resnet.c @@ -0,0 +1,80 @@ +#include +#include +#include + +typedef int32_t vl_type; + +// Adapt from conv2d_less_m2, add outChannel and inChannel +// +// when AVL >= MAXVL, this is efficent +void conv2d(int32_t *restrict output_, int32_t const *restrict img_, + int32_t const *restrict kernel_, size_t imgRow, size_t imgCol, + size_t outChannel, size_t inChannel, size_t kernelSize) { + + size_t const outRow = imgRow - kernelSize + 1; + size_t const outCol = imgCol - kernelSize + 1; + + for (size_t coI = 0; coI < outChannel; coI++) { + for (size_t ciI = 0; ciI < inChannel; ciI++) { + int32_t *output = output_ + coI * (outRow * outCol); + int32_t const *img = img_ + ciI * (imgRow * imgCol); + int32_t const *kernel = kernel_ + (coI * inChannel + ciI) * kernelSize * kernelSize; + + for (size_t iI = 0; iI < imgRow; iI++) { + for (size_t kI = 0; kI < kernelSize; kI++) { + // only need img[kI] to img[imgRow + kI - kernelSize] + if (!(kI <= iI && iI < kI + outCol)) + continue; + + for (size_t kJ = 0; kJ < kernelSize; kJ++) { + int32_t const K = kernel[kI * kernelSize + kJ]; + // from img[iI][kJ] to img[iI][imgCol - kernelSize], step by 1 + // imgCol - kernelSize + 1 is the number of elements to be processed + int32_t const *imgPtr = img + iI * imgCol + kJ; + // from output[iI - kI][0] to output[iI - kI][imgCol - kernelSize], + int32_t *outPtr = output + (iI - kI) * outCol; + // when AVL >= MAXVL, this is efficent + size_t avl = imgCol - kernelSize + 1; + while (avl > 0) { + // TODO: exchange vl-loop and kJ loop, can be more cache friendly + + size_t vl = __riscv_vsetvl_e32m2(avl); + + vint32m2_t imgVec = __riscv_vle32_v_i32m2(imgPtr, vl); + vint32m2_t mulVec = __riscv_vmul_vx_i32m2(imgVec, K, vl); + + vint32m2_t outVec = __riscv_vle32_v_i32m2(outPtr, vl); + vint32m2_t resVec = __riscv_vadd_vv_i32m2(outVec, mulVec, vl); + + __riscv_vse32_v_i32m2(outPtr, resVec, vl); + + avl -= vl; + imgPtr += vl; + outPtr += vl; + } + } + } + } + } + } +} + +#define VDATA __attribute((section(".vdata"))) +#define VBASS __attribute((section(".vbss"))) + +// size of image is padded to (H+K-1, W+K-1) + +// CONV1: O[224, 224, 64], W[64, 3, 7, 7] +VDATA int32_t conv1_img[64*230*230]; +VDATA int32_t conv1_output[64*224*224]; +VDATA int32_t conv1_kernel[64*3*7*7]; + +// CONV2: ... + +int test() { + conv2d(conv1_output, conv1_img, conv1_kernel, 224, 224, 64, 3, 7); + + // conv2d(...) + + return 0; +} diff --git a/tests/pytorch/default.nix b/tests/pytorch/default.nix index 44a4c09ea..a8c9867cb 100644 --- a/tests/pytorch/default.nix +++ b/tests/pytorch/default.nix @@ -25,6 +25,7 @@ let nativeBuildInputs = [ buddy-mlir.pyenv buddy-mlir + buddy-mlir.llvm ]; src = sourcePath; @@ -41,7 +42,7 @@ let for mlir in ''${optArtifacts[@]}; do echo "Translating $mlir" - buddy-translate --buddy-to-llvmir "$mlir" -o "$mlir.ll" + mlir-translate --mlir-to-llvmir "$mlir" -o "$mlir.ll" translateArtifacts+=("$mlir.ll") done @@ -55,10 +56,10 @@ let for llvmir in ''${translateArtifacts[@]}; do echo "Compiling $llvmir" - buddy-llc "$llvmir" \ + llc "$llvmir" \ -mtriple=riscv32 \ -target-abi=ilp32f \ - -mattr=+m,+f,+zve32f \ + -mattr=+m,+f,+zvl4096b,+zve32f \ --filetype=obj \ -o "$llvmir.o" diff --git a/tests/pytorch/include/buddy/Core/Container.h b/tests/pytorch/include/buddy/Core/Container.h new file mode 100644 index 000000000..9f6229c19 --- /dev/null +++ b/tests/pytorch/include/buddy/Core/Container.h @@ -0,0 +1,144 @@ +//===- Container.h --------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// Container descriptor. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +// This is vendored version of buddy-mlir Container.h file at revision +// c57584a0e3c38e938a3902320f62b202ced84996. Modified for T1 embedded test env. +// +//===----------------------------------------------------------------------===// + +#include +#include + +// MemRef descriptor. +// - T represents the type of the elements. +// - N represents the number of dimensions. +// - The storage order is NCHW. +template class MemRef { +public: + // Construct using init to allocated area + constexpr MemRef(T *allocated, T init, intptr_t sizes[N]); + constexpr MemRef(T *allocated, intptr_t sizes[N], intptr_t offset = 0); + // Get the data pointer. + T *getData(); + // Get the sizes (shape). + const intptr_t *getSizes() { return sizes; } + // Get the strides. + const intptr_t *getStrides() { return strides; } + // Get the rank of the memref. + size_t getRank() const { return N; } + // Get the size (number of elements). + size_t getSize() const { return product(this->sizes); } + // Get the element at index. + const T &operator[](size_t index) const; + T &operator[](size_t index); + +protected: + // Set the strides. + // Computes the strides of the transposed tensor for transpose=true. + inline void setStrides(); + // Compute the product of array elements. + inline size_t product(const intptr_t sizes[N]) const; + + // Data. + // The `aligned` and `allocated` members point to the same address, `aligned` + // member is responsible for handling data, and `allocated` member is + // resposible for handling the memory space. + T *allocated = nullptr; + T *aligned = nullptr; + // Offset. + intptr_t offset = 0; + // Shape. + intptr_t sizes[N]; + // Strides. + intptr_t strides[N]; +}; + +template +constexpr MemRef::MemRef(T *allocated, T init, intptr_t sizes[N]) + : MemRef(allocated, sizes) { + size_t size = product(sizes); + std::fill(aligned, aligned + size, init); +} + +// MemRef Array Constructor. +// Construct a MemRef object from the data pointer, sizes, and offset. +// The default offset is 0. +template +constexpr MemRef::MemRef(T *data, intptr_t sizes[N], intptr_t offset) { + this->offset = offset; + for (size_t i = 0; i < N; i++) { + this->sizes[i] = sizes[i]; + } + setStrides(); + size_t size = product(sizes); + allocated = data; + aligned = allocated; + for (size_t i = 0; i < size; i++) { + aligned[i] = data[i]; + } +} + +// Get the data pointer. +// Return the `aligned` pointer if the container data size is greater than zero. +// If the data size is negative or zero, which means no space is allocated for +// the container data pointer, the function does not allow to return the data +// pointer. +template T *MemRef::getData() { + size_t size = product(this->sizes); + return aligned; +} + +// Get the element at index. +// Return the specific element if the container data size is greater than zero. +// If the data size is negative or zero, which means no space is allocated for +// the container data pointer, this operator does not allow to return the data +// element. +template +const T &MemRef::operator[](size_t index) const { + size_t size = product(this->sizes); + return aligned[index + offset]; +} + +template T &MemRef::operator[](size_t index) { + size_t size = product(this->sizes); + return aligned[index + offset]; +} + +// Calculate the stride values for each dimension based on the sizes. +template inline void MemRef::setStrides() { + strides[N - 1] = 1; + if (N < 2) + return; + // Prevent implicit conversions between unsigned and signed + for (std::size_t i = N - 1; i > 0; i--) { + strides[i - 1] = strides[i] * sizes[i]; + } +} + +// Calculate the total number of elements in the MemRef container. +template +inline size_t MemRef::product(const intptr_t sizes[N]) const { + size_t size = 1; + for (size_t i = 0; i < N; i++) + size *= sizes[i]; + return size; +} diff --git a/tests/pytorch/include/img.hpp b/tests/pytorch/include/img.hpp deleted file mode 100644 index f839df342..000000000 --- a/tests/pytorch/include/img.hpp +++ /dev/null @@ -1,234 +0,0 @@ -//===- ImgContainer.h -----------------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// -// original source from https://github.com/buddy-compiler/buddy-mlir -// Modified for embedded environment. -// -//===----------------------------------------------------------------------===// - -#ifndef IMG_H -#define IMG_H - -#include "memref.hpp" - -enum ImageModes { - DIP_GRAYSCALE = 0, - DIP_RGB = 1, -}; - -template class Image : public MemRef { -public: - // Constructor initializes the image by loading from a file. - // Params: - // file: Raw data to image file memory - // filesize: size of the file memory - // buffer: pre-allocated space for image modification - // mode: Specifies the image mode (e.g., DIP_GRAYSCALE, DIP_RGB). - // norm: Indicates whether to normalize pixel values (default is false). - constexpr Image(const char *file, int32_t filesize, T *buffer, - ImageModes mode, bool norm = false); - - // Overload - constexpr Image(T *data, const int32_t sizes[N]); - constexpr Image(T *data, T init, const int32_t sizes[N]); - - // Retrieves the name of the current image format as a string. - inline const char *getFormatName() const { - switch (this->imageFormat) { - case ImageFormat::BMP: - return "BMP"; - default: - return "Unsupported format"; - } - } - // Returns the width of the image in pixels. - inline size_t getWidth() const { return this->width; } - // Returns the height of the image in pixels. - inline size_t getHeight() const { return this->height; } - // Returns the bit depth of the image. - inline int getBitDepth() const { return this->bitDepth; } - -private: - // Enum to represent supported image formats. - enum class ImageFormat { - BMPDecodeError, // Represents an error or unsupported format. - BMP, // BMP file format. - Unsupported, - } imageFormat; - // Mode of the image (e.g., DIP_GRAYSCALE, DIP_RGB). - ImageModes imageMode; - // Width of the image in pixels. - int32_t width; - // Height of the image in pixels. - int32_t height; - // Bit depth of the image. - int32_t bitDepth; - // Normalization flag. - bool isNorm; - // Determines the image format from raw file data. - inline void determineFormat(const uint8_t *fileData, uint32_t filesize); - // Decodes a BMP image from raw file data. - inline bool decodeBMP(const uint8_t *fileData, uint32_t filesize); -}; - -template -constexpr Image::Image(T *data, const int32_t sizes[N]) - : MemRef(data, sizes){}; - -template -constexpr Image::Image(T *data, T init, const int32_t sizes[N]) - : MemRef(data, init, sizes){}; - -// Image Container Constructor -// Constructs an image container object from the image file path. -template -constexpr Image::Image(const char *file, int32_t filesize, T *buffer, - ImageModes mode, bool norm) - : imageMode(mode), isNorm(norm) { - - this->allocated = buffer; - this->aligned = this->allocated; - - determineFormat(file, filesize); - if (this->imageFormat == ImageFormat::BMP) { - bool success = decodeBMP(file, filesize); - if (!success) { - this->imageFormat = ImageFormat::BMPDecodeError; - }; - } else { - this->imageFormat = ImageFormat::Unsupported; - } -} - -// Determines the image format by inspecting the header of the file data. -template -inline void Image::determineFormat(const uint8_t *fileData, - uint32_t filesize) { - if (filesize > 2 && fileData[0] == 'B' && fileData[1] == 'M') { - this->imageFormat = ImageFormat::BMP; - } else { - this->imageFormat = ImageFormat::BMPDecodeError; - } -} - -// BMP Image File Decoder -template -inline bool Image::decodeBMP(const uint8_t *fileData, uint32_t filesize) { - // Check if the provided data is large enough to contain a minimal BMP header - // (54 bytes). - if (filesize < 54) { - return false; - } - - // Extract image information from BMP header - this->width = *reinterpret_cast(&fileData[18]); - this->height = *reinterpret_cast(&fileData[22]); - this->bitDepth = *reinterpret_cast(&fileData[28]); - uint32_t compression = *reinterpret_cast(&fileData[30]); - size_t pixelDataOffset = *reinterpret_cast(&fileData[10]); - - // Currently, only the BI_RGB (value 0) compression method is supported. - if (compression != 0) { - return false; - } - - // Currently, only the NCHW format with 4 dimensions is supported. - if (N == 4) { - if (this->imageMode == ImageModes::DIP_GRAYSCALE) { - // TODO: Add batch setting. - this->sizes[0] = 1; - this->sizes[1] = 1; - this->sizes[2] = this->height; - this->sizes[3] = this->width; - this->setStrides(); - size_t size = this->product(this->sizes); - // Fullfill data to memref container. - size_t memrefIndex = 0; - if (this->bitDepth == 32) { - // BMP file is upside-down storage. - for (size_t i = this->height; i > 0; i--) { - for (size_t j = 0; j < this->width; j++) { - // Locate the current pixel. - size_t pixelIndex = - pixelDataOffset + (((i - 1) * this->width) + j) * 4; - // Extract the blue, green, and red value from the current pixel. - int bluePixel = - *reinterpret_cast(&fileData[pixelIndex]); - int greenPixel = - *reinterpret_cast(&fileData[pixelIndex + 1]); - int redPixel = - *reinterpret_cast(&fileData[pixelIndex + 2]); - // Calculate the gray scale value. - int grayScaleValue = static_cast( - 0.299 * redPixel + 0.587 * greenPixel + 0.114 * bluePixel); - // Store the gray scale value into memref container. - this->aligned[memrefIndex] = - this->isNorm ? static_cast(grayScaleValue) / 255 - : static_cast(grayScaleValue); - memrefIndex++; - } - } - } else { - return false; - } - } else if (this->imageMode == ImageModes::DIP_RGB) { - // TODO: Add batch setting. - this->sizes[0] = 1; - this->sizes[1] = 3; - this->sizes[2] = this->height; - this->sizes[3] = this->width; - this->setStrides(); - size_t size = this->product(this->sizes); - // Fullfill data to memref container. - size_t memrefIndex = 0; - size_t colorStride = this->height * this->width; - if (this->bitDepth == 32) { - // BMP file is upside-down storage. - for (size_t i = height; i > 0; i--) { - for (size_t j = 0; j < width; j++) { - // Locate the current pixel. - size_t pixelIndex = pixelDataOffset + (((i - 1) * width) + j) * 4; - // Extract the blue, green, and red value from the current pixel. - int bluePixel = - *reinterpret_cast(&fileData[pixelIndex]); - int greenPixel = - *reinterpret_cast(&fileData[pixelIndex + 1]); - int redPixel = - *reinterpret_cast(&fileData[pixelIndex + 2]); - // Store the values into memref container as RGB order. (BGR -> RGB) - this->aligned[memrefIndex] = this->isNorm - ? static_cast(redPixel) / 255 - : static_cast(redPixel); - this->aligned[memrefIndex + colorStride] = - this->isNorm ? static_cast(greenPixel) / 255 - : static_cast(greenPixel); - this->aligned[memrefIndex + 2 * colorStride] = - this->isNorm ? static_cast(bluePixel) / 255 - : static_cast(bluePixel); - memrefIndex++; - } - } - } else { - return false; - } - } - } else { - return false; - } - return true; -} - -#endif // IMG_H diff --git a/tests/pytorch/include/memref.hpp b/tests/pytorch/include/memref.hpp deleted file mode 100644 index 95e6725f2..000000000 --- a/tests/pytorch/include/memref.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//===- ImgContainer.h -----------------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// -// Original source from https://github.com/buddy-compiler/buddy-mlir. -// Modified for embedded environment. -// -//===----------------------------------------------------------------------===// - -#ifndef MEMREF_H -#define MEMREF_H - -#include -#include - -template class MemRef { -public: - constexpr MemRef(T *data, const int32_t sizes[N]); - constexpr MemRef(T *data, T init, const int32_t sizes[N]); - -protected: - inline void setStrides(); - - // https://github.com/llvm/llvm-project/blob/a50b9633357007ff886f3fd228ca4b8a9b9b9852/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp#L401 - T *allocated = nullptr; - T *aligned = nullptr; - int32_t offset = 0; - int32_t sizes[N]; - int32_t strides[N]; -}; - -template -constexpr MemRef::MemRef(T *data, const int32_t sizes[N]) { - for (size_t i = 0; i < N; i++) { - this->sizes[i] = sizes[i]; - } - - setStrides(); - - allocated = data; - aligned = data; -} - -template -constexpr MemRef::MemRef(T *data, T init, const int32_t sizes[N]) - : MemRef(data, sizes) { - - int32_t total_size = 0; - for (size_t i = 0; i < N; i++) { - total_size += sizes[i]; - } - - for (int32_t i = 0; i < total_size; i++) { - aligned[i] = init; - } -} - -template inline void MemRef::setStrides() { - strides[N - 1] = 1; - if (N < 2) - return; - - for (std::size_t i = N - 1; i > 0; i--) { - strides[i - 1] = strides[i] * sizes[i]; - } -} - -#endif // MEMREF_H diff --git a/tests/pytorch/lenet/build.nix b/tests/pytorch/lenet/build.nix index baabc0454..31357cc4d 100644 --- a/tests/pytorch/lenet/build.nix +++ b/tests/pytorch/lenet/build.nix @@ -14,43 +14,42 @@ buildBuddyE2ETest { echo "Lowering forward.mlir" buddy-opt forward.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \ empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \ func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \ | buddy-opt -pass-pipeline \ "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \ eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \ - convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \ - convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \ - convert-func-to-llvm, reconcile-unrealized-casts)" \ + convert-math-to-llvm, convert-scf-to-cf, \ + convert-arith-to-llvm{index-bitwidth=32}, expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \ + convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \ > forward-lowered.mlir echo "Lowering subgraphs[0]" buddy-opt subgraphs0.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \ | buddy-opt \ - --convert-elementwise-to-linalg \ + --eliminate-empty-tensors \ + --convert-tensor-to-linalg \ + --linalg-bufferize \ + --convert-linalg-to-loops \ + --lower-affine \ --func-bufferize-dynamic-offset \ --arith-bufferize \ - --func-bufferize \ --tensor-bufferize \ - --linalg-bufferize \ + --buffer-deallocation \ --finalizing-bufferize \ - --batchmatmul-optimize \ - --convert-linalg-to-affine-loops \ - --lower-affine \ - --convert-vector-to-scf \ - --convert-scf-to-cf \ + --memref-expand \ --llvm-request-c-wrappers \ + --convert-vector-to-llvm=force-32bit-vector-indices \ --lower-vector-exp \ --lower-rvv=rv32 \ - --convert-vector-to-llvm \ - --convert-math-to-llvm \ - --convert-math-to-libm \ - --convert-arith-to-llvm \ - --convert-func-to-llvm \ --expand-strided-metadata \ - --finalize-memref-to-llvm \ + --finalize-memref-to-llvm=index-bitwidth=32 \ + --convert-index-to-llvm=index-bitwidth=32 \ + --convert-scf-to-cf \ + --convert-arith-to-llvm=index-bitwidth=32 \ + --convert-func-to-llvm=index-bitwidth=32 \ --reconcile-unrealized-casts \ > subgraphs0-lowered.mlir @@ -58,5 +57,8 @@ buildBuddyE2ETest { "forward-lowered.mlir" "subgraphs0-lowered.mlir" ) + + mkdir -p "$out/resources" + cp -v ''${optArtifacts[*]} "$out/resources" ''; } diff --git a/tests/pytorch/lenet/lenet.cc b/tests/pytorch/lenet/lenet.cc index 3cb5d1a88..226ffe04c 100644 --- a/tests/pytorch/lenet/lenet.cc +++ b/tests/pytorch/lenet/lenet.cc @@ -1,4 +1,4 @@ -#include "memref.hpp" +#include #define INPUT_N 1 #define INPUT_C 1 @@ -13,14 +13,9 @@ __attribute((section(".vdata"))) float output_0[OUTPUT_N]; __attribute((section(".vdata"))) float param_0[PARAM_N]; // Define the sizes of the input and output tensors. -static const int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W}; -static const int32_t sizesOutput[2] = {1, OUTPUT_N}; -static const int32_t sizesParams[1] = {PARAM_N}; - -// Create input and output containers for the image and model output. -MemRef input(input_0, sizesInput); -MemRef output(output_0, sizesOutput); -MemRef params(param_0, 2.0, sizesParams); +static int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W}; +static int32_t sizesOutput[2] = {1, OUTPUT_N}; +static int32_t sizesParams[1] = {PARAM_N}; // Declare the target model C interface. extern "C" { @@ -29,6 +24,11 @@ void _mlir_ciface_forward(MemRef *output, MemRef *arg0, } extern "C" int test() { + // Create input and output containers for the image and model output. + MemRef input(input_0, sizesInput); + MemRef output(output_0, sizesOutput); + MemRef params(param_0, 2.0, sizesParams); + _mlir_ciface_forward(&output, ¶ms, &input); return 0; } diff --git a/tests/pytorch/lib/MemrefCopy.cc b/tests/pytorch/lib/MemrefCopy.cc index c7f15ff79..073333331 100644 --- a/tests/pytorch/lib/MemrefCopy.cc +++ b/tests/pytorch/lib/MemrefCopy.cc @@ -24,38 +24,38 @@ template struct UnrankedMemRefType { template struct StridedMemRefType { T *basePtr; T *data; - int64_t offset; - int64_t sizes[N]; - int64_t strides[N]; + int32_t offset; + int32_t sizes[N]; + int32_t strides[N]; }; /// StridedMemRef descriptor type specialized for rank 1. template struct StridedMemRefType { T *basePtr; T *data; - int64_t offset; - int64_t sizes[1]; - int64_t strides[1]; + int32_t offset; + int32_t sizes[1]; + int32_t strides[1]; - T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); } + T &operator[](int32_t idx) { return *(data + offset + idx * strides[0]); } }; /// StridedMemRef descriptor type specialized for rank 0. template struct StridedMemRefType { T *basePtr; T *data; - int64_t offset; + int32_t offset; }; // A reference to one of the StridedMemRef types. template class DynamicMemRefType { public: - int64_t rank; + int32_t rank; T *basePtr; T *data; - int64_t offset; - const int64_t *sizes; - const int64_t *strides; + int32_t offset; + const int32_t *sizes; + const int32_t *strides; explicit DynamicMemRefType(const StridedMemRefType &memRef) : rank(0), basePtr(memRef.basePtr), data(memRef.data), @@ -75,6 +75,7 @@ template class DynamicMemRefType { } }; +// TODO: can we vectorize this extern "C" void memrefCopy(int32_t elemSize, UnrankedMemRefType *srcArg, UnrankedMemRefType *dstArg) { DynamicMemRefType src(*srcArg); diff --git a/tests/pytorch/matmul/build.nix b/tests/pytorch/matmul/build.nix index 9e84d8219..67cd326c1 100644 --- a/tests/pytorch/matmul/build.nix +++ b/tests/pytorch/matmul/build.nix @@ -6,29 +6,26 @@ buildBuddyE2ETest { echo "Lowering forward.mlir" python ./matmul.py \ - | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \ - | buddy-opt --convert-elementwise-to-linalg \ + | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \ + | buddy-opt \ + --convert-elementwise-to-linalg \ + --one-shot-bufferize="bufferize-function-boundaries" \ --func-bufferize-dynamic-offset \ - --arith-bufferize \ - --func-bufferize \ - --tensor-bufferize \ - --linalg-bufferize \ - --finalizing-bufferize \ - --batchmatmul-optimize \ --convert-linalg-to-affine-loops \ + --batchmatmul-optimize \ --lower-affine \ --lower-vector-exp \ --lower-rvv=rv32 \ --convert-vector-to-scf \ --convert-scf-to-cf \ + --convert-cf-to-llvm \ --llvm-request-c-wrappers \ --convert-vector-to-llvm \ --convert-math-to-llvm \ - --convert-math-to-libm \ - --convert-arith-to-llvm \ - --convert-func-to-llvm \ + --convert-arith-to-llvm=index-bitwidth=32 \ + --convert-func-to-llvm=index-bitwidth=32 \ --expand-strided-metadata \ - --finalize-memref-to-llvm \ + --finalize-memref-to-llvm=index-bitwidth=32 \ --reconcile-unrealized-casts \ -o forward-lowered.mlir diff --git a/tests/pytorch/matmul/matmul.cc b/tests/pytorch/matmul/matmul.cc index b523f0626..2e1a020dd 100644 --- a/tests/pytorch/matmul/matmul.cc +++ b/tests/pytorch/matmul/matmul.cc @@ -1,22 +1,22 @@ -#include "memref.hpp" +#include extern "C" void _mlir_ciface_forward(MemRef *output, MemRef *arg1, MemRef *arg2); -// One-dimension, with length 512 -static const int32_t sizes[3] = {8, 8, 8}; - __attribute((section(".vdata"))) float input_float_1[512]; -MemRef input1(input_float_1, sizes); - __attribute((section(".vdata"))) float input_float_2[512]; -MemRef input2(input_float_2, sizes); - __attribute((section(".vdata"))) float output_float_1[512]; -MemRef output(output_float_1, sizes); extern "C" int test() { + // One-dimension, with length 512 + static int32_t sizes[3] = {8, 8, 8}; + + MemRef input1(input_float_1, sizes); + MemRef input2(input_float_2, sizes); + MemRef output(output_float_1, sizes); + _mlir_ciface_forward(&output, &input1, &input2); + return 0; } diff --git a/tests/pytorch/mobilenet/build.nix b/tests/pytorch/mobilenet/build.nix index ac666cc68..2eddc68c6 100644 --- a/tests/pytorch/mobilenet/build.nix +++ b/tests/pytorch/mobilenet/build.nix @@ -20,20 +20,20 @@ buildBuddyE2ETest { echo "Lowering forward.mlir" buddy-opt forward.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \ empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \ func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \ | buddy-opt -pass-pipeline \ "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \ eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \ - convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \ - convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \ - convert-func-to-llvm, reconcile-unrealized-casts)" \ + convert-math-to-llvm, convert-scf-to-cf, \ + convert-arith-to-llvm{index-bitwidth=32}, expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \ + convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \ > forward-lowered.mlir echo "Lowering subgraphs[0]" buddy-opt subgraphs0.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \ | buddy-opt \ --convert-elementwise-to-linalg \ --func-bufferize-dynamic-offset \ @@ -42,7 +42,6 @@ buildBuddyE2ETest { --tensor-bufferize \ --linalg-bufferize \ --finalizing-bufferize \ - --batchmatmul-optimize \ --convert-linalg-to-affine-loops \ --lower-affine \ --convert-vector-to-scf \ @@ -52,11 +51,10 @@ buildBuddyE2ETest { --lower-rvv=rv32 \ --convert-vector-to-llvm \ --convert-math-to-llvm \ - --convert-math-to-libm \ - --convert-arith-to-llvm \ - --convert-func-to-llvm \ + --convert-arith-to-llvm=index-bitwidth=32 \ + --convert-func-to-llvm=index-bitwidth=32 \ --expand-strided-metadata \ - --finalize-memref-to-llvm \ + --finalize-memref-to-llvm=index-bitwidth=32 \ --reconcile-unrealized-casts \ > subgraphs0-lowered.mlir diff --git a/tests/pytorch/mobilenet/mobilenet.cc b/tests/pytorch/mobilenet/mobilenet.cc index d64064fbb..8b398d074 100644 --- a/tests/pytorch/mobilenet/mobilenet.cc +++ b/tests/pytorch/mobilenet/mobilenet.cc @@ -15,18 +15,17 @@ __attribute((section(".vdata"))) float output_0[OUTPUT_N]; __attribute((section(".vdata"))) float param_0[PARAM_N0]; __attribute((section(".vdata"))) int64_t param_1[PARAM_N1]; -// Define the sizes of the input and output tensors. -static const int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W}; -static const int32_t sizesOutput[2] = {1, OUTPUT_N}; -static const int32_t sizesParam0[1] = {PARAM_N0}; -static const int32_t sizesParam1[1] = {PARAM_N1}; - extern "C" { void _mlir_ciface_forward(MemRef *output, MemRef *arg0, MemRef *arg1, Image *input); } extern "C" int test() { + // Define the sizes of the input and output tensors. + static int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W}; + static int32_t sizesOutput[2] = {1, OUTPUT_N}; + static int32_t sizesParam0[1] = {PARAM_N0}; + static int32_t sizesParam1[1] = {PARAM_N1}; // Generate input memref container with random numbers. const int inputSize = INPUT_N * INPUT_C * INPUT_H * INPUT_W; diff --git a/tests/pytorch/resnet18/build.nix b/tests/pytorch/resnet18/build.nix new file mode 100644 index 000000000..a143aace9 --- /dev/null +++ b/tests/pytorch/resnet18/build.nix @@ -0,0 +1,76 @@ +{ + fetchurl, + buildBuddyE2ETest, +}: +let + checkpointFile = "resnet18-f37072fd.pth"; + modelCache = fetchurl { + url = "https://download.pytorch.org/models/${checkpointFile}"; + hash = "sha256-83By/UfonF6CdiHFuv+nUAgZ94lrus7BYLGhbFYOB+w="; + }; +in +buildBuddyE2ETest { + caseName = "resnet18"; + + optPhase = '' + mkdir -p pytorchCache/hub/checkpoints/ + cp -v ${modelCache} pytorchCache/hub/checkpoints/${checkpointFile} + export TORCH_HOME=pytorchCache + + python3 ./resnet18.py --output-dir $PWD + + echo "Lowering forward.mlir" + buddy-opt forward.mlir -pass-pipeline \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith{use-32-bit}), \ + empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, convert-vector-to-llvm{force-32bit-vector-indices index-bitwidth=32})" \ + | buddy-opt -pass-pipeline \ + "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \ + eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \ + convert-math-to-llvm, convert-scf-to-cf, \ + convert-arith-to-llvm{index-bitwidth=32}, convert-func-to-llvm{index-bitwidth=32}, \ + expand-strided-metadata, finalize-memref-to-llvm{index-bitwidth=32}, \ + convert-func-to-llvm{index-bitwidth=32}, reconcile-unrealized-casts)" \ + > forward-lowered.mlir + + echo "Lowering subgraph0.mlir" + buddy-opt subgraph0.mlir -pass-pipeline \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \ + | buddy-opt \ + --convert-elementwise-to-linalg \ + --one-shot-bufferize="bufferize-function-boundaries" \ + --func-bufferize-dynamic-offset \ + --conv-nhwc-fhwc-optimize \ + --batchmatmul-optimize \ + --convert-linalg-to-loops \ + --expand-strided-metadata \ + --llvm-request-c-wrappers \ + --lower-affine \ + --convert-vector-to-llvm="force-32bit-vector-indices index-bitwidth=32" \ + --convert-scf-to-cf \ + --convert-cf-to-llvm=index-bitwidth=32 \ + --convert-arith-to-llvm=index-bitwidth=32 \ + --convert-math-to-llvm \ + --convert-func-to-llvm=index-bitwidth=32 \ + --finalize-memref-to-llvm=index-bitwidth=32 \ + --convert-index-to-llvm=index-bitwidth=32 \ + --reconcile-unrealized-casts \ + --mlir-print-ir-after-change \ + --verify-each \ + > subgraph0-lowered.mlir + + echo "Compiling memrefCopy library" + $CXX -nostdlib -c ${../lib/MemrefCopy.cc} -o memrefCopy.o + + llcArtifacts+=( + memrefCopy.o + ) + + optArtifacts+=( + "forward-lowered.mlir" + "subgraph0-lowered.mlir" + ) + + mkdir -p "$out"/share + cp -v ''${optArtifacts[*]} "$out"/share/ + ''; +} diff --git a/tests/pytorch/resnet18/dog-224_224.png b/tests/pytorch/resnet18/dog-224_224.png new file mode 100644 index 000000000..4c6649714 Binary files /dev/null and b/tests/pytorch/resnet18/dog-224_224.png differ diff --git a/tests/pytorch/resnet18/resnet18.cc b/tests/pytorch/resnet18/resnet18.cc new file mode 100644 index 000000000..05378626b --- /dev/null +++ b/tests/pytorch/resnet18/resnet18.cc @@ -0,0 +1,24 @@ +#include + +// Declare the resnet C interface. +extern "C" void _mlir_ciface_forward(MemRef *output, + MemRef *arg0, + MemRef *input); + +__attribute((section(".vdata"))) float output_float_1[1000]; +__attribute((section(".vdata"))) float IMAGE[150528]; +__attribute((section(".vdata"))) float PARAMS[11699112]; + +extern "C" int test() { + static int32_t sizes[2] = {1, 1000}; + static int32_t params_sizes[1] = {11699112}; + static int32_t image_sizes[4] = {1, 3, 224, 224}; + + MemRef output(output_float_1, 7.0, sizes); + + MemRef inputResize(IMAGE, 6.0, image_sizes); + MemRef paramsContainer(PARAMS, 5.0, params_sizes); + + _mlir_ciface_forward(&output, ¶msContainer, &inputResize); + return 0; +} diff --git a/tests/pytorch/resnet18/resnet18.py b/tests/pytorch/resnet18/resnet18.py new file mode 100644 index 000000000..1cbc61bc4 --- /dev/null +++ b/tests/pytorch/resnet18/resnet18.py @@ -0,0 +1,98 @@ +# ===- buddy-resnet-import.py -------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the ResNet18 model AOT importer. +# +# ===--------------------------------------------------------------------------- + +import os +import argparse +from pathlib import Path +import numpy as np +import torch +import torchvision.models as models +import torch._inductor.lowering +from torch._inductor.decomposition import decompositions as inductor_decomp +from torch._decomp import remove_decompositions + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.graph import GraphDriver +from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.ops import tosa + +# Parse command-line arguments +parser = argparse.ArgumentParser(description="ResNet18 model AOT importer") +parser.add_argument( + "--output-dir", type=str, default="./", help="Directory to save output files." +) +args = parser.parse_args() + +# Ensure output directory exists +output_dir = os.path.abspath(args.output_dir) +os.makedirs(output_dir, exist_ok=True) + +# Retrieve the ResNet18 model path. +model_path = os.path.dirname(os.path.abspath(__file__)) + +model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT) +model = model.eval() + +# Remove the num_batches_tracked attribute. +for layer in model.modules(): + if isinstance(layer, torch.nn.BatchNorm2d): + if hasattr(layer, "num_batches_tracked"): + del layer.num_batches_tracked + +DEFAULT_DECOMPOSITIONS = [ + torch.ops.aten.max_pool2d_with_indices.default, +] + +remove_decompositions(inductor_decomp, DEFAULT_DECOMPOSITIONS) + +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) +data = torch.randn([1, 3, 224, 224]) +# Import the model into MLIR module and parameters. +with torch.no_grad(): + graphs = dynamo_compiler.importer(model, data) +assert len(graphs) == 1 +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +pattern_list = [simply_fuse] +graphs[0].fuse_ops(pattern_list) +driver = GraphDriver(graphs[0]) +driver.subgraphs[0].lower_to_top_level_ir() + +# Write the MLIR module and forward graph to the specified output directory +with open(os.path.join(output_dir, "subgraph0.mlir"), "w") as module_file: + print(driver.subgraphs[0]._imported_module, file=module_file) +with open(os.path.join(output_dir, "forward.mlir"), "w") as module_file: + print(driver.construct_main_graph(True), file=module_file) + +params = dynamo_compiler.imported_params[graph] +current_path = os.path.dirname(os.path.abspath(__file__)) + +float32_param = np.concatenate( + [ + param.detach().numpy().reshape([-1]) + for param in params + if param.dtype == torch.float32 + ] +) +float32_param.tofile(Path(output_dir) / "arg0.data") diff --git a/tests/pytorch/tinyllama/build.nix b/tests/pytorch/tinyllama/build.nix index 24e4c00d7..e07a2a895 100644 --- a/tests/pytorch/tinyllama/build.nix +++ b/tests/pytorch/tinyllama/build.nix @@ -14,45 +14,21 @@ buildBuddyE2ETest { env.LLAMA_MODEL_PATH = "${model}"; optPhase = '' - python ./tinyllama.py + python3 ./tinyllama.py --output-dir $PWD echo "Lowering forward.mlir" - buddy-opt forward.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),\ - func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ - | buddy-opt --arith-expand \ - --eliminate-empty-tensors \ - --empty-tensor-to-alloc-tensor \ - --one-shot-bufferize \ - --matmul-parallel-vectorization-optimize \ - --batchmatmul-optimize \ - --convert-linalg-to-affine-loops \ - --affine-loop-fusion \ - --lower-affine \ - --func-bufferize \ - --arith-bufferize \ - --tensor-bufferize \ - --buffer-deallocation \ - --finalizing-bufferize \ - --convert-vector-to-scf \ - --expand-strided-metadata \ - --convert-vector-to-llvm \ - --memref-expand \ - --arith-expand \ - --convert-arith-to-llvm \ - --finalize-memref-to-llvm \ - --convert-scf-to-cf \ - --llvm-request-c-wrappers \ - --convert-arith-to-llvm \ - --convert-math-to-llvm \ - --convert-math-to-libm \ - --convert-func-to-llvm \ - --reconcile-unrealized-casts \ + cat forward.mlir \ + | buddy-opt \ + --expand-strided-metadata \ + --finalize-memref-to-llvm=index-bitwidth=32 \ + --llvm-request-c-wrappers \ + --convert-func-to-llvm=index-bitwidth=32 \ + --reconcile-unrealized-casts \ > forward-lowered.mlir echo "Lowering subgraphs[0]" - buddy-opt subgraphs0.mlir -pass-pipeline \ - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \ + buddy-opt subgraph0.mlir -pass-pipeline \ + "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith{use-32-bit}, tosa-to-linalg, tosa-to-tensor))" \ | buddy-opt \ --convert-elementwise-to-linalg \ --arith-expand \ @@ -64,29 +40,26 @@ buildBuddyE2ETest { --arith-bufferize \ --buffer-deallocation \ --finalizing-bufferize \ - --matmul-parallel-vectorization-optimize \ - --batchmatmul-optimize \ --convert-linalg-to-affine-loops \ --affine-loop-fusion \ --lower-affine \ --convert-vector-to-scf \ --expand-strided-metadata \ + --llvm-request-c-wrappers \ --cse \ --lower-vector-exp \ --lower-rvv=rv32 \ --convert-vector-to-llvm \ --memref-expand \ --arith-expand \ - --convert-arith-to-llvm \ - --finalize-memref-to-llvm \ + --convert-arith-to-llvm=index-bitwidth=32 \ + --finalize-memref-to-llvm=index-bitwidth=32 \ --convert-scf-to-cf \ - --llvm-request-c-wrappers \ - --convert-arith-to-llvm \ + --convert-arith-to-llvm=index-bitwidth=32 \ --convert-math-to-llvm \ - --convert-math-to-libm \ - --convert-func-to-llvm \ + --convert-func-to-llvm=index-bitwidth=32 \ --reconcile-unrealized-casts \ - > subgraphs0-lowered.mlir + > subgraph0-lowered.mlir echo "Compiling memrefCopy library" $CXX -nostdlib -c ${../lib/MemrefCopy.cc} -o memrefCopy.o @@ -96,7 +69,10 @@ buildBuddyE2ETest { optArtifacts+=( "forward-lowered.mlir" - "subgraphs0-lowered.mlir" + "subgraph0-lowered.mlir" ) + + mkdir -p "$out/resources" + cp -v ''${optArtifacts[*]} "$out/resources" ''; } diff --git a/tests/pytorch/tinyllama/tinyllama.cc b/tests/pytorch/tinyllama/tinyllama.cc index 705d16985..b53d2ade1 100644 --- a/tests/pytorch/tinyllama/tinyllama.cc +++ b/tests/pytorch/tinyllama/tinyllama.cc @@ -18,43 +18,44 @@ // //===----------------------------------------------------------------------===// -#include "memref.hpp" +#include +#include -constexpr size_t ParamsSize = 110581; -// constexpr size_t ParamsSize = 11058; -constexpr size_t MaxVocabSize = 32000; -constexpr size_t MaxTokenLength = 40; -constexpr size_t HiddenSize = 2048; +#define PARAMS_SIZE 673 +#define MAX_VOCAB_SIZE 320 +#define MAX_TOKEN_LENGTH 40 +#define HIDDEN_SIZE 128 // resultContainer[0] -__attribute((section(".vdata"))) float result0[1 + MaxTokenLength + HiddenSize]; -static constexpr int32_t sizesResult0[3] = {1, MaxTokenLength, HiddenSize}; - +__attribute(( + section(".vdata"))) float result0[1 + MAX_TOKEN_LENGTH + HIDDEN_SIZE]; // resultContainer[1] __attribute(( - section(".vdata"))) float result1[1 + MaxTokenLength + MaxVocabSize]; -static constexpr int32_t sizesResult1[3] = {1, MaxTokenLength, MaxVocabSize}; - + section(".vdata"))) float result1[1 + MAX_TOKEN_LENGTH + MAX_VOCAB_SIZE]; // inputContainer -__attribute((section(".vdata"))) int32_t input[1 + MaxTokenLength]; -static constexpr int32_t sizesInput[2] = {1, MaxTokenLength}; - +__attribute((section(".vdata"))) int32_t input[1 + MAX_TOKEN_LENGTH]; // paramsContainer -__attribute((section(".vdata"))) float param[ParamsSize]; -static constexpr int32_t sizesParam[1] = {ParamsSize}; +__attribute((section(".vdata"))) float param[PARAMS_SIZE]; extern "C" { void _mlir_ciface_forward(MemRef *a, MemRef *b, MemRef *c); } -MemRef resultContainer[2] = { - MemRef(result0, 2.0, sizesResult0), - MemRef(result1, 3.0, sizesResult1)}; -MemRef inputContainer(input, 4, sizesInput); -MemRef paramsContainerf32(param, 5.0, sizesParam); - extern "C" int test() { + int32_t sizesResult0[3] = {1, MAX_TOKEN_LENGTH, HIDDEN_SIZE}; + int32_t sizesResult1[3] = {1, MAX_TOKEN_LENGTH, MAX_VOCAB_SIZE}; + + int32_t sizesInput[2] = {1, MAX_TOKEN_LENGTH}; + MemRef inputContainer(input, 4.0, sizesInput); + + MemRef resultContainer[2] = { + MemRef(result0, 2.0, sizesResult0), + MemRef(result1, 3.0, sizesResult1)}; + + int32_t sizesParam[1] = {PARAMS_SIZE}; + MemRef paramsContainerf32(param, 5.0, sizesParam); + _mlir_ciface_forward(resultContainer, ¶msContainerf32, &inputContainer); return 0; } diff --git a/tests/pytorch/tinyllama/tinyllama.py b/tests/pytorch/tinyllama/tinyllama.py index bd60c6567..b47bbb605 100644 --- a/tests/pytorch/tinyllama/tinyllama.py +++ b/tests/pytorch/tinyllama/tinyllama.py @@ -1,39 +1,37 @@ -# ===- buddy_tinyllama_import.py ----------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# This is the TinyLlama model AOT importer. -# -# ===--------------------------------------------------------------------------- - import os -import sys +import argparse import torch +import torch._dynamo as dynamo +from transformers import LlamaForCausalLM, LlamaTokenizer from torch._inductor.decomposition import decompositions as inductor_decomp -from transformers import AutoModelForCausalLM, AutoTokenizer +import numpy from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.ops import tosa from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.graph.transform import simply_fuse, apply_classic_fusion + +# Add argument parser to allow custom output directory. +parser = argparse.ArgumentParser(description="LLaMA2 model AOT importer") +parser.add_argument( + "--output-dir", type=str, default="./", help="Directory to save output files." +) +args = parser.parse_args() -checkpoint = os.environ.get("LLAMA_MODEL_PATH") -if checkpoint is None: - sys.exit("Error: No model path was provided. Please set $LLAMA_MODEL_PATH") -tokenizer = AutoTokenizer.from_pretrained(checkpoint) -model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto") +# Ensure the output directory exists. +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) + +# Retrieve the LLaMA model path from environment variables. +model_path = os.environ.get("LLAMA_MODEL_PATH") +if model_path is None: + raise EnvironmentError( + "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid." + ) + +# Initialize the tokenizer and model from the specified model path. +tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True) +model = LlamaForCausalLM.from_pretrained(model_path, torchscript=True) model.config.use_cache = False # Initialize Dynamo Compiler with specific configurations as an importer. @@ -54,7 +52,9 @@ graphs[0].fuse_ops(pattern_list) driver = GraphDriver(graphs[0]) driver.subgraphs[0].lower_to_top_level_ir() -with open("subgraphs0.mlir", "w") as module_file: + +# Save the generated files to the specified output directory. +with open(os.path.join(output_dir, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) -with open("forward.mlir", "w") as module_file: +with open(os.path.join(output_dir, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file)